diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index b8b419d93021a..ac0cb549d020b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -470,6 +470,24 @@ MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm, return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops); } +SDNode *AMDGPUDAGToDAGISel::packConstantV2I16(const SDNode *N, + SelectionDAG &DAG) const { + // TODO: Handle undef as zero + + assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2); + uint32_t LHSVal, RHSVal; + if (getConstantValue(N->getOperand(0), LHSVal) && + getConstantValue(N->getOperand(1), RHSVal)) { + SDLoc SL(N); + uint32_t K = (LHSVal & 0xffff) | (RHSVal << 16); + return DAG.getMachineNode( + isVGPRImm(N) ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32, SL, + N->getValueType(0), DAG.getTargetConstant(K, SL, MVT::i32)); + } + + return nullptr; +} + void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) { EVT VT = N->getValueType(0); unsigned NumVectorElts = VT.getVectorNumElements(); @@ -708,10 +726,14 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { break; } + const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); assert(VT.getVectorElementType().bitsEq(MVT::i32)); - unsigned RegClassID = - SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID(); - SelectBuildVector(N, RegClassID); + const TargetRegisterClass *RegClass = + N->isDivergent() + ? TRI->getDefaultVectorSuperClassForBitWidth(NumVectorElts * 32) + : SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32); + + SelectBuildVector(N, RegClass->getID()); return; } case ISD::VECTOR_SHUFFLE: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 4fa0d3f72e1c7..c902b7e7f1d87 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -45,21 +45,6 @@ static inline bool getConstantValue(SDValue N, uint32_t &Out) { return false; } -// TODO: Handle undef as zero -static inline SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG) { - assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2); - uint32_t LHSVal, RHSVal; - if (getConstantValue(N->getOperand(0), LHSVal) && - getConstantValue(N->getOperand(1), RHSVal)) { - SDLoc SL(N); - uint32_t K = (LHSVal & 0xffff) | (RHSVal << 16); - return DAG.getMachineNode(AMDGPU::S_MOV_B32, SL, N->getValueType(0), - DAG.getTargetConstant(K, SL, MVT::i32)); - } - - return nullptr; -} - /// AMDGPU specific code to select AMDGPU machine instructions for /// SelectionDAG operations. class AMDGPUDAGToDAGISel : public SelectionDAGISel { @@ -115,6 +100,8 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const; + SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG) const; + SDNode *glueCopyToOp(SDNode *N, SDValue NewChain, SDValue Glue) const; SDNode *glueCopyToM0(SDNode *N, SDValue Val) const; SDNode *glueCopyToM0LDSInit(SDNode *N) const; diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll index bc341f2baa804..e882769f97ac1 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll @@ -95,13 +95,13 @@ define void @flat_atomic_cmpxchg_i32_ret_a_a__a(ptr %ptr) #0 { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def a0 +; CHECK-NEXT: ; def a1 ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def a1 +; CHECK-NEXT: ; def a0 ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v2, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v3, a0 +; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 +; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 ; CHECK-NEXT: buffer_wbl2 ; CHECK-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] offset:40 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -126,13 +126,13 @@ define void @flat_atomic_cmpxchg_i32_ret_a_a__v(ptr %ptr) #0 { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def a0 +; CHECK-NEXT: ; def a1 ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def a1 +; CHECK-NEXT: ; def a0 ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v2, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v3, a0 +; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 +; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 ; CHECK-NEXT: buffer_wbl2 ; CHECK-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] offset:40 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -156,12 +156,14 @@ define void @flat_atomic_cmpxchg_i32_ret_v_a__v(ptr %ptr) #0 { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def a0 +; CHECK-NEXT: ; def v2 ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 +; CHECK-NEXT: v_accvgpr_write_b32 a1, v2 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v3 +; CHECK-NEXT: ; def a0 ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 +; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 ; CHECK-NEXT: buffer_wbl2 ; CHECK-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] offset:40 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -299,12 +301,13 @@ define void @flat_atomic_cmpxchg_i32_ret_av_a__av(ptr %ptr) #0 { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def a0 +; CHECK-NEXT: ; def a1 ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v3 +; CHECK-NEXT: ; def a0 ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 +; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 ; CHECK-NEXT: buffer_wbl2 ; CHECK-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] offset:40 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -328,12 +331,13 @@ define void @flat_atomic_cmpxchg_i32_ret_a_av__av(ptr %ptr) #0 { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def a0 +; CHECK-NEXT: ; def a1 ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v3, a0 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v2 +; CHECK-NEXT: ; def a0 ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 +; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 ; CHECK-NEXT: buffer_wbl2 ; CHECK-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] offset:40 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -533,50 +537,55 @@ define void @flat_atomic_cmpxchg_i64_ret_a_a__a(ptr %ptr) #0 { ; CHECK-LABEL: flat_atomic_cmpxchg_i64_ret_a_a__a: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 +; CHECK-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 +; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base +; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def a[0:1] +; CHECK-NEXT: ; def a[2:3] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 -; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base -; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 +; CHECK-NEXT: v_accvgpr_read_b32 v5, a3 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def a[0:1] +; CHECK-NEXT: ; def a[4:5] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 -; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; CHECK-NEXT: v_accvgpr_read_b32 v2, a4 +; CHECK-NEXT: v_accvgpr_read_b32 v4, a2 +; CHECK-NEXT: v_accvgpr_read_b32 v3, a5 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; CHECK-NEXT: ; implicit-def: $agpr0_agpr1 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; CHECK-NEXT: s_cbranch_execz .LBB15_2 ; CHECK-NEXT: ; %bb.1: ; %atomicrmw.global +; CHECK-NEXT: v_accvgpr_read_b32 v2, a4 +; CHECK-NEXT: v_accvgpr_read_b32 v3, a5 +; CHECK-NEXT: v_accvgpr_read_b32 v4, a2 +; CHECK-NEXT: v_accvgpr_read_b32 v5, a3 ; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; CHECK-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_invl2 ; CHECK-NEXT: buffer_wbinvl1_vol ; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 +; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 ; CHECK-NEXT: v_accvgpr_write_b32 a1, v1 -; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 +; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: .LBB15_2: ; %Flow ; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; CHECK-NEXT: s_cbranch_execz .LBB15_4 ; CHECK-NEXT: ; %bb.3: ; %atomicrmw.private -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; CHECK-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; CHECK-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CHECK-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; CHECK-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: v_accvgpr_write_b32 a0, v4 +; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] -; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; CHECK-NEXT: v_accvgpr_write_b32 a1, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; CHECK-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; CHECK-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc +; CHECK-NEXT: v_accvgpr_write_b32 a1, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CHECK-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; CHECK-NEXT: .LBB15_4: ; %atomicrmw.phi ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] @@ -598,50 +607,55 @@ define void @flat_atomic_cmpxchg_i64_ret_a_a__v(ptr %ptr) #0 { ; CHECK-LABEL: flat_atomic_cmpxchg_i64_ret_a_a__v: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 +; CHECK-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 +; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base +; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 -; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base -; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc -; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 +; CHECK-NEXT: v_accvgpr_read_b32 v5, a1 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def a[0:1] +; CHECK-NEXT: ; def a[2:3] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 -; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 -; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 +; CHECK-NEXT: v_accvgpr_read_b32 v7, a3 +; CHECK-NEXT: v_accvgpr_read_b32 v4, a0 +; CHECK-NEXT: v_accvgpr_read_b32 v6, a2 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; CHECK-NEXT: s_cbranch_execz .LBB16_2 ; CHECK-NEXT: ; %bb.1: ; %atomicrmw.global +; CHECK-NEXT: v_accvgpr_read_b32 v2, a2 +; CHECK-NEXT: v_accvgpr_read_b32 v3, a3 +; CHECK-NEXT: v_accvgpr_read_b32 v4, a0 +; CHECK-NEXT: v_accvgpr_read_b32 v5, a1 ; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[6:7], v[0:3] glc +; CHECK-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_invl2 ; CHECK-NEXT: buffer_wbinvl1_vol +; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 +; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 ; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 -; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CHECK-NEXT: .LBB16_2: ; %Flow ; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; CHECK-NEXT: s_cbranch_execz .LBB16_4 ; CHECK-NEXT: ; %bb.3: ; %atomicrmw.private -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; CHECK-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc -; CHECK-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CHECK-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; CHECK-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] -; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; CHECK-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen -; CHECK-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; CHECK-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v7, vcc +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: .LBB16_4: ; %atomicrmw.phi ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use v[4:5] +; CHECK-NEXT: ; use v[2:3] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -658,48 +672,51 @@ define void @flat_atomic_cmpxchg_i64_ret_v_a__v(ptr %ptr) #0 { ; CHECK-LABEL: flat_atomic_cmpxchg_i64_ret_v_a__v: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 +; CHECK-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 ; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base -; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc +; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 -; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 +; CHECK-NEXT: v_accvgpr_read_b32 v7, a1 +; CHECK-NEXT: v_accvgpr_read_b32 v6, a0 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[2:3] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 +; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; CHECK-NEXT: s_cbranch_execz .LBB17_2 ; CHECK-NEXT: ; %bb.1: ; %atomicrmw.global +; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 +; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 ; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[6:7], v[0:3] glc +; CHECK-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_invl2 ; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 +; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 ; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 +; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 ; CHECK-NEXT: .LBB17_2: ; %Flow ; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; CHECK-NEXT: s_cbranch_execz .LBB17_4 ; CHECK-NEXT: ; %bb.3: ; %atomicrmw.private -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; CHECK-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc -; CHECK-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; CHECK-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; CHECK-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] -; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; CHECK-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen -; CHECK-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; CHECK-NEXT: v_cndmask_b32_e32 v3, v0, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v1, v7, vcc +; CHECK-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:4 ; CHECK-NEXT: .LBB17_4: ; %atomicrmw.phi ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use v[4:5] +; CHECK-NEXT: ; use v[0:1] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -716,48 +733,51 @@ define void @flat_atomic_cmpxchg_i64_ret_a_v__v(ptr %ptr) #0 { ; CHECK-LABEL: flat_atomic_cmpxchg_i64_ret_a_v__v: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 +; CHECK-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 ; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base -; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc +; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 +; CHECK-NEXT: v_accvgpr_read_b32 v7, a1 +; CHECK-NEXT: v_accvgpr_read_b32 v6, a0 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 +; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; CHECK-NEXT: s_cbranch_execz .LBB18_2 ; CHECK-NEXT: ; %bb.1: ; %atomicrmw.global +; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 +; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 ; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[6:7], v[0:3] glc +; CHECK-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[0:3] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_invl2 ; CHECK-NEXT: buffer_wbinvl1_vol +; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 ; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 -; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 +; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: .LBB18_2: ; %Flow ; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; CHECK-NEXT: s_cbranch_execz .LBB18_4 ; CHECK-NEXT: ; %bb.3: ; %atomicrmw.private -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; CHECK-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc -; CHECK-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; CHECK-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; CHECK-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] -; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; CHECK-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen -; CHECK-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[6:7] +; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CHECK-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; CHECK-NEXT: .LBB18_4: ; %atomicrmw.phi ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use v[4:5] +; CHECK-NEXT: ; use v[2:3] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -947,48 +967,51 @@ define void @flat_atomic_cmpxchg_i64_ret_av_a__av(ptr %ptr) #0 { ; CHECK-LABEL: flat_atomic_cmpxchg_i64_ret_av_a__av: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 +; CHECK-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 ; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base -; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc +; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 -; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 +; CHECK-NEXT: v_accvgpr_read_b32 v7, a1 +; CHECK-NEXT: v_accvgpr_read_b32 v6, a0 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[2:3] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 +; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; CHECK-NEXT: s_cbranch_execz .LBB22_2 ; CHECK-NEXT: ; %bb.1: ; %atomicrmw.global +; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 +; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 ; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[6:7], v[0:3] glc +; CHECK-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_invl2 ; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 +; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 ; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 +; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 ; CHECK-NEXT: .LBB22_2: ; %Flow ; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; CHECK-NEXT: s_cbranch_execz .LBB22_4 ; CHECK-NEXT: ; %bb.3: ; %atomicrmw.private -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; CHECK-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc -; CHECK-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; CHECK-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; CHECK-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] -; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; CHECK-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen -; CHECK-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; CHECK-NEXT: v_cndmask_b32_e32 v3, v0, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v1, v7, vcc +; CHECK-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:4 ; CHECK-NEXT: .LBB22_4: ; %atomicrmw.phi ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use v[4:5] +; CHECK-NEXT: ; use v[0:1] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -1005,48 +1028,51 @@ define void @flat_atomic_cmpxchg_i64_ret_a_av__av(ptr %ptr) #0 { ; CHECK-LABEL: flat_atomic_cmpxchg_i64_ret_a_av__av: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 +; CHECK-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 ; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base -; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc +; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 +; CHECK-NEXT: v_accvgpr_read_b32 v7, a1 +; CHECK-NEXT: v_accvgpr_read_b32 v6, a0 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 +; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; CHECK-NEXT: s_cbranch_execz .LBB23_2 ; CHECK-NEXT: ; %bb.1: ; %atomicrmw.global +; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 +; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 ; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[6:7], v[0:3] glc +; CHECK-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[0:3] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_invl2 ; CHECK-NEXT: buffer_wbinvl1_vol +; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 ; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 -; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 +; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: .LBB23_2: ; %Flow ; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; CHECK-NEXT: s_cbranch_execz .LBB23_4 ; CHECK-NEXT: ; %bb.3: ; %atomicrmw.private -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; CHECK-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc -; CHECK-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; CHECK-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; CHECK-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] -; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; CHECK-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen -; CHECK-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[6:7] +; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CHECK-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; CHECK-NEXT: .LBB23_4: ; %atomicrmw.phi ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use v[4:5] +; CHECK-NEXT: ; use v[2:3] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll index ae53bdff7c251..5cceb918b755e 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll @@ -2531,16 +2531,16 @@ define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v6 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_xor_b32_e32 v3, v1, v7 -; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v2 +; GFX90A-NEXT: v_xor_b32_e32 v1, v1, v6 +; GFX90A-NEXT: v_xor_b32_e32 v2, v2, v7 +; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB32_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART @@ -2779,16 +2779,16 @@ define void @flat_atomic_xor_expansion_i64_ret_v_a(ptr %ptr) #0 { ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v6 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_xor_b32_e32 v3, v1, v7 -; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v2 +; GFX90A-NEXT: v_xor_b32_e32 v1, v1, v6 +; GFX90A-NEXT: v_xor_b32_e32 v2, v2, v7 +; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB34_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART @@ -3138,16 +3138,16 @@ define void @flat_atomic_xor_expansion_i64_ret_av_a(ptr %ptr) #0 { ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v6 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_xor_b32_e32 v3, v1, v7 -; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v2 +; GFX90A-NEXT: v_xor_b32_e32 v1, v1, v6 +; GFX90A-NEXT: v_xor_b32_e32 v2, v2, v7 +; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB37_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART @@ -4332,15 +4332,15 @@ define void @flat_atomic_xor_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4 -; GFX90A-NEXT: v_xor_b32_e32 v3, v1, v3 -; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v4 +; GFX90A-NEXT: v_xor_b32_e32 v1, v1, v2 +; GFX90A-NEXT: v_xor_b32_e32 v3, v4, v3 +; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB53_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] @@ -4520,15 +4520,15 @@ define void @flat_atomic_xor_i64_ret_v_a(ptr %ptr) #0 { ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4 -; GFX90A-NEXT: v_xor_b32_e32 v3, v1, v3 -; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v4 +; GFX90A-NEXT: v_xor_b32_e32 v1, v1, v2 +; GFX90A-NEXT: v_xor_b32_e32 v3, v4, v3 +; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB55_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] @@ -4789,15 +4789,15 @@ define void @flat_atomic_xor_i64_ret_av_a(ptr %ptr) #0 { ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4 -; GFX90A-NEXT: v_xor_b32_e32 v3, v1, v3 -; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v4 +; GFX90A-NEXT: v_xor_b32_e32 v1, v1, v2 +; GFX90A-NEXT: v_xor_b32_e32 v3, v4, v3 +; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB58_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] @@ -6704,15 +6704,15 @@ define void @flat_atomic_and_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_and_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4 -; GFX90A-NEXT: v_and_b32_e32 v3, v1, v3 -; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v4 +; GFX90A-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, v4, v3 +; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB93_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] @@ -6905,17 +6905,17 @@ define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc -; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_and_b32_e32 v3, v1, v7 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1 +; GFX90A-NEXT: v_and_b32_e32 v1, v1, v6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_and_b32_e32 v4, v2, v6 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_not_b32_e32 v2, v3 -; GFX90A-NEXT: v_not_b32_e32 v3, v4 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v2 +; GFX90A-NEXT: v_and_b32_e32 v2, v2, v7 +; GFX90A-NEXT: v_not_b32_e32 v1, v1 +; GFX90A-NEXT: v_not_b32_e32 v2, v2 +; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB95_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] @@ -7147,15 +7147,15 @@ define void @flat_atomic_or_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4 -; GFX90A-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v4 +; GFX90A-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX90A-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB97_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] @@ -8787,15 +8787,15 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v1, v6 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_subb_co_u32_e32 v4, vcc, v2, v7, vcc ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc +; GFX90A-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v6 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc -; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v7, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB113_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART @@ -9848,9 +9848,8 @@ define void @flat_atomic_fadd_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen @@ -10117,9 +10116,8 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], -v[6:7] ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen @@ -10338,10 +10336,9 @@ define void @flat_atomic_fmax_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen @@ -10527,10 +10524,9 @@ define void @flat_atomic_fmin_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen @@ -10733,16 +10729,15 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] ; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB135_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART @@ -11001,16 +10996,15 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] ; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB137_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART @@ -13378,27 +13372,28 @@ define void @flat_atomic_nand_i32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB172_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_and_b32_e32 v2, v3, v4 -; GFX90A-NEXT: v_not_b32_e32 v2, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: v_and_b32_e32 v0, v1, v4 +; GFX90A-NEXT: v_not_b32_e32 v0, v0 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB172_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -13406,26 +13401,27 @@ define void @flat_atomic_nand_i32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB172_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_bitop3_b32 v2, v3, v4, v3 bitop3:0x3f -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: v_bitop3_b32 v0, v1, v4, v1 bitop3:0x3f +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB172_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 @@ -14144,28 +14140,29 @@ define void @flat_atomic_usub_cond_i32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB190_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_sub_u32_e32 v2, v3, v4 -; GFX90A-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: v_sub_u32_e32 v0, v1, v4 +; GFX90A-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB190_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -14173,29 +14170,30 @@ define void @flat_atomic_usub_cond_i32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB190_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_sub_u32_e32 v2, v3, v4 -; GFX950-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 +; GFX950-NEXT: v_sub_u32_e32 v0, v1, v4 +; GFX950-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB190_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 @@ -14277,26 +14275,27 @@ define void @flat_atomic_usub_sat_i32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB192_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_sub_u32_e64 v2, v3, v4 clamp -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: v_sub_u32_e64 v0, v1, v4 clamp +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB192_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -14304,26 +14303,27 @@ define void @flat_atomic_usub_sat_i32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB192_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_sub_u32_e64 v2, v3, v4 clamp -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: v_sub_u32_e64 v0, v1, v4 clamp +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB192_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 @@ -14894,14 +14894,14 @@ define void @flat_atomic_and_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_and_b32_e32 v0, v4, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4 -; GFX90A-NEXT: v_and_b32_e32 v1, v3, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, v3, v0 +; GFX90A-NEXT: v_and_b32_e32 v1, v4, v1 ; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB199_4: ; %atomicrmw.end @@ -15088,17 +15088,17 @@ define void @flat_atomic_nand_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v0, s4 -; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_and_b32_e32 v3, v1, v5 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1 +; GFX90A-NEXT: v_and_b32_e32 v1, v1, v4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_and_b32_e32 v4, v2, v4 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_not_b32_e32 v2, v3 -; GFX90A-NEXT: v_not_b32_e32 v3, v4 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v2 +; GFX90A-NEXT: v_and_b32_e32 v2, v2, v5 +; GFX90A-NEXT: v_not_b32_e32 v1, v1 +; GFX90A-NEXT: v_not_b32_e32 v2, v2 +; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB201_6: ; %atomicrmw.phi ; GFX90A-NEXT: ;;#ASMSTART @@ -15323,14 +15323,14 @@ define void @flat_atomic_or_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4 -; GFX90A-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v4 +; GFX90A-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX90A-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB203_4: ; %atomicrmw.end @@ -15502,14 +15502,14 @@ define void @flat_atomic_xor_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_xor_b32_e32 v0, v4, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4 -; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v4 +; GFX90A-NEXT: v_xor_b32_e32 v0, v3, v0 +; GFX90A-NEXT: v_xor_b32_e32 v1, v4, v1 ; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB205_4: ; %atomicrmw.end @@ -17091,15 +17091,15 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v1, v4 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_subb_co_u32_e32 v4, vcc, v2, v5, vcc ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc +; GFX90A-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc -; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v5, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB221_6: ; %atomicrmw.phi ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] @@ -17533,26 +17533,27 @@ define void @flat_atomic_fsub_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB226_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_sub_f32_e32 v2, v3, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: v_sub_f32_e32 v0, v1, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB226_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -17560,26 +17561,27 @@ define void @flat_atomic_fsub_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB226_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_sub_f32_e32 v2, v3, v4 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: v_sub_f32_e32 v0, v1, v4 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB226_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 @@ -17971,26 +17973,27 @@ define void @flat_atomic_fmaximum_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB232_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_maximum3_f32 v2, v3, v4, v4 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: v_maximum3_f32 v0, v1, v4, v4 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB232_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 @@ -18106,26 +18109,27 @@ define void @flat_atomic_fminimum_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB234_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_minimum3_f32 v2, v3, v4, v4 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: v_minimum3_f32 v0, v1, v4, v4 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB234_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 @@ -18180,9 +18184,8 @@ define void @flat_atomic_fadd_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_mov_b32_e32 v6, s6 ; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_add_f64 v[4:5], v[2:3], v[0:1] ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen @@ -18429,9 +18432,8 @@ define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_mov_b32_e32 v6, s4 ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], -v[4:5] ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen @@ -18643,10 +18645,9 @@ define void @flat_atomic_fmax_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_max_f64 v[0:1], v[4:5], v[0:1] ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen @@ -18826,10 +18827,9 @@ define void @flat_atomic_fmin_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_min_f64 v[0:1], v[4:5], v[0:1] ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen @@ -19025,16 +19025,15 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] ; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB243_6: ; %atomicrmw.phi ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] @@ -19285,16 +19284,15 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] ; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB245_6: ; %atomicrmw.phi ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] @@ -19559,26 +19557,27 @@ define void @flat_atomic_fadd_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB248_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_add_f16 v2, v3, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: v_pk_add_f16 v0, v1, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB248_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -19675,26 +19674,27 @@ define void @flat_atomic_fsub_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB250_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_add_f16 v2, v3, v4 neg_lo:[0,1] neg_hi:[0,1] -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: v_pk_add_f16 v0, v1, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB250_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -19702,26 +19702,27 @@ define void @flat_atomic_fsub_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB250_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_add_f16 v2, v3, v4 neg_lo:[0,1] neg_hi:[0,1] -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: v_pk_add_f16 v0, v1, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB250_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 @@ -20125,26 +20126,27 @@ define void @flat_atomic_fmaximum_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB256_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_maximum3_f16 v2, v3, v4, v4 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v1, v4, v4 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB256_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 @@ -20268,26 +20270,27 @@ define void @flat_atomic_fminimum_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB258_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_minimum3_f16 v2, v3, v4, v4 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v1, v4, v4 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB258_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 diff --git a/llvm/test/CodeGen/AMDGPU/a-v-global-atomic-cmpxchg.ll b/llvm/test/CodeGen/AMDGPU/a-v-global-atomic-cmpxchg.ll index 063feec759efa..6f1cb79e66423 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-global-atomic-cmpxchg.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-global-atomic-cmpxchg.ll @@ -95,13 +95,13 @@ define void @global_atomic_cmpxchg_i32_ret_a_a__a(ptr addrspace(1) %ptr) #0 { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def a0 +; CHECK-NEXT: ; def a1 ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def a1 +; CHECK-NEXT: ; def a0 ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v2, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v3, a0 +; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 +; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 ; CHECK-NEXT: buffer_wbl2 ; CHECK-NEXT: global_atomic_cmpswap v0, v[0:1], v[2:3], off offset:40 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) @@ -126,13 +126,13 @@ define void @global_atomic_cmpxchg_i32_ret_a_a__v(ptr addrspace(1) %ptr) #0 { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def a0 +; CHECK-NEXT: ; def a1 ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def a1 +; CHECK-NEXT: ; def a0 ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v2, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v3, a0 +; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 +; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 ; CHECK-NEXT: buffer_wbl2 ; CHECK-NEXT: global_atomic_cmpswap v0, v[0:1], v[2:3], off offset:40 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) @@ -156,12 +156,14 @@ define void @global_atomic_cmpxchg_i32_ret_v_a__v(ptr addrspace(1) %ptr) #0 { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def a0 +; CHECK-NEXT: ; def v2 ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 +; CHECK-NEXT: v_accvgpr_write_b32 a1, v2 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v3 +; CHECK-NEXT: ; def a0 ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 +; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 ; CHECK-NEXT: buffer_wbl2 ; CHECK-NEXT: global_atomic_cmpswap v0, v[0:1], v[2:3], off offset:40 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) @@ -299,12 +301,13 @@ define void @global_atomic_cmpxchg_i32_ret_av_a__av(ptr addrspace(1) %ptr) #0 { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def a0 +; CHECK-NEXT: ; def a1 ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v3 +; CHECK-NEXT: ; def a0 ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 +; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 ; CHECK-NEXT: buffer_wbl2 ; CHECK-NEXT: global_atomic_cmpswap v0, v[0:1], v[2:3], off offset:40 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) @@ -328,12 +331,13 @@ define void @global_atomic_cmpxchg_i32_ret_a_av__av(ptr addrspace(1) %ptr) #0 { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def a0 +; CHECK-NEXT: ; def a1 ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v3, a0 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v2 +; CHECK-NEXT: ; def a0 ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 +; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 ; CHECK-NEXT: buffer_wbl2 ; CHECK-NEXT: global_atomic_cmpswap v0, v[0:1], v[2:3], off offset:40 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) @@ -449,13 +453,13 @@ define void @global_atomic_cmpxchg_i64_ret_a_a__a(ptr addrspace(1) %ptr) #0 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v5, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v4, a0 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def a[0:1] +; CHECK-NEXT: ; def a[2:3] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 +; CHECK-NEXT: v_accvgpr_read_b32 v2, a2 +; CHECK-NEXT: v_accvgpr_read_b32 v3, a3 +; CHECK-NEXT: v_accvgpr_read_b32 v4, a0 +; CHECK-NEXT: v_accvgpr_read_b32 v5, a1 ; CHECK-NEXT: buffer_wbl2 ; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) @@ -483,13 +487,13 @@ define void @global_atomic_cmpxchg_i64_ret_a_a__v(ptr addrspace(1) %ptr) #0 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v5, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v4, a0 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def a[0:1] +; CHECK-NEXT: ; def a[2:3] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 +; CHECK-NEXT: v_accvgpr_read_b32 v2, a2 +; CHECK-NEXT: v_accvgpr_read_b32 v3, a3 +; CHECK-NEXT: v_accvgpr_read_b32 v4, a0 +; CHECK-NEXT: v_accvgpr_read_b32 v5, a1 ; CHECK-NEXT: buffer_wbl2 ; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) @@ -515,8 +519,8 @@ define void @global_atomic_cmpxchg_i64_ret_v_a__v(ptr addrspace(1) %ptr) #0 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 ; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 +; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[4:5] ; CHECK-NEXT: ;;#ASMEND @@ -545,8 +549,8 @@ define void @global_atomic_cmpxchg_i64_ret_a_v__v(ptr addrspace(1) %ptr) #0 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v5, a1 ; CHECK-NEXT: v_accvgpr_read_b32 v4, a0 +; CHECK-NEXT: v_accvgpr_read_b32 v5, a1 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[2:3] ; CHECK-NEXT: ;;#ASMEND @@ -661,8 +665,8 @@ define void @global_atomic_cmpxchg_i64_ret_av_a__av(ptr addrspace(1) %ptr) #0 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 ; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 +; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[4:5] ; CHECK-NEXT: ;;#ASMEND @@ -691,8 +695,8 @@ define void @global_atomic_cmpxchg_i64_ret_a_av__av(ptr addrspace(1) %ptr) #0 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v5, a1 ; CHECK-NEXT: v_accvgpr_read_b32 v4, a0 +; CHECK-NEXT: v_accvgpr_read_b32 v5, a1 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[2:3] ; CHECK-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index d3ebd92f0677b..8e92dc80fe1d6 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -7459,53 +7459,81 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_writelane_b32 v20, s30, 0 -; SI-NEXT: v_writelane_b32 v20, s31, 1 -; SI-NEXT: v_writelane_b32 v20, s34, 2 -; SI-NEXT: v_writelane_b32 v20, s35, 3 -; SI-NEXT: v_writelane_b32 v20, s36, 4 -; SI-NEXT: v_writelane_b32 v20, s37, 5 -; SI-NEXT: v_writelane_b32 v20, s38, 6 -; SI-NEXT: v_writelane_b32 v20, s39, 7 -; SI-NEXT: v_writelane_b32 v20, s48, 8 -; SI-NEXT: v_writelane_b32 v20, s49, 9 -; SI-NEXT: v_writelane_b32 v20, s50, 10 -; SI-NEXT: v_writelane_b32 v20, s51, 11 -; SI-NEXT: v_writelane_b32 v20, s52, 12 -; SI-NEXT: v_writelane_b32 v20, s53, 13 -; SI-NEXT: v_writelane_b32 v20, s54, 14 -; SI-NEXT: v_writelane_b32 v20, s55, 15 -; SI-NEXT: v_writelane_b32 v20, s64, 16 -; SI-NEXT: v_writelane_b32 v20, s65, 17 -; SI-NEXT: v_writelane_b32 v20, s66, 18 -; SI-NEXT: v_writelane_b32 v20, s67, 19 -; SI-NEXT: v_writelane_b32 v20, s68, 20 -; SI-NEXT: v_writelane_b32 v20, s69, 21 -; SI-NEXT: v_writelane_b32 v20, s70, 22 -; SI-NEXT: v_writelane_b32 v20, s71, 23 -; SI-NEXT: v_writelane_b32 v20, s80, 24 -; SI-NEXT: v_writelane_b32 v20, s81, 25 -; SI-NEXT: v_writelane_b32 v20, s82, 26 -; SI-NEXT: v_writelane_b32 v20, s83, 27 -; SI-NEXT: v_writelane_b32 v20, s84, 28 -; SI-NEXT: v_writelane_b32 v20, s85, 29 -; SI-NEXT: v_writelane_b32 v20, s86, 30 -; SI-NEXT: v_writelane_b32 v20, s87, 31 -; SI-NEXT: v_writelane_b32 v20, s96, 32 -; SI-NEXT: v_writelane_b32 v20, s97, 33 +; SI-NEXT: v_writelane_b32 v21, s30, 0 +; SI-NEXT: v_writelane_b32 v21, s31, 1 +; SI-NEXT: v_writelane_b32 v21, s34, 2 +; SI-NEXT: v_writelane_b32 v21, s35, 3 +; SI-NEXT: v_writelane_b32 v21, s36, 4 +; SI-NEXT: v_writelane_b32 v21, s37, 5 +; SI-NEXT: v_writelane_b32 v21, s38, 6 +; SI-NEXT: v_writelane_b32 v21, s39, 7 +; SI-NEXT: v_writelane_b32 v21, s48, 8 +; SI-NEXT: v_writelane_b32 v21, s49, 9 +; SI-NEXT: v_writelane_b32 v21, s50, 10 +; SI-NEXT: v_writelane_b32 v21, s51, 11 +; SI-NEXT: v_writelane_b32 v21, s52, 12 +; SI-NEXT: v_writelane_b32 v21, s53, 13 +; SI-NEXT: v_writelane_b32 v21, s54, 14 +; SI-NEXT: v_writelane_b32 v21, s55, 15 +; SI-NEXT: v_writelane_b32 v21, s64, 16 +; SI-NEXT: v_writelane_b32 v21, s65, 17 +; SI-NEXT: v_writelane_b32 v21, s66, 18 +; SI-NEXT: v_writelane_b32 v21, s67, 19 +; SI-NEXT: v_writelane_b32 v21, s68, 20 +; SI-NEXT: v_mov_b32_e32 v20, s16 +; SI-NEXT: v_writelane_b32 v21, s69, 21 +; SI-NEXT: v_readfirstlane_b32 s56, v20 +; SI-NEXT: v_mov_b32_e32 v20, s17 +; SI-NEXT: v_writelane_b32 v21, s70, 22 +; SI-NEXT: v_readfirstlane_b32 s57, v20 +; SI-NEXT: v_mov_b32_e32 v20, s18 +; SI-NEXT: v_writelane_b32 v21, s71, 23 +; SI-NEXT: v_readfirstlane_b32 s46, v20 +; SI-NEXT: v_mov_b32_e32 v20, s19 +; SI-NEXT: v_writelane_b32 v21, s80, 24 +; SI-NEXT: v_readfirstlane_b32 s47, v20 +; SI-NEXT: v_mov_b32_e32 v20, s20 +; SI-NEXT: v_writelane_b32 v21, s81, 25 +; SI-NEXT: v_readfirstlane_b32 s44, v20 +; SI-NEXT: v_mov_b32_e32 v20, s21 +; SI-NEXT: v_writelane_b32 v21, s82, 26 +; SI-NEXT: v_readfirstlane_b32 s45, v20 +; SI-NEXT: v_mov_b32_e32 v20, s22 +; SI-NEXT: v_writelane_b32 v21, s83, 27 +; SI-NEXT: v_readfirstlane_b32 s42, v20 +; SI-NEXT: v_mov_b32_e32 v20, s23 +; SI-NEXT: v_writelane_b32 v21, s84, 28 +; SI-NEXT: v_readfirstlane_b32 s43, v20 +; SI-NEXT: v_mov_b32_e32 v20, s24 +; SI-NEXT: v_writelane_b32 v21, s85, 29 +; SI-NEXT: v_readfirstlane_b32 s40, v20 +; SI-NEXT: v_mov_b32_e32 v20, s25 +; SI-NEXT: v_writelane_b32 v21, s86, 30 +; SI-NEXT: v_readfirstlane_b32 s41, v20 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_writelane_b32 v21, s87, 31 +; SI-NEXT: v_readfirstlane_b32 s24, v20 +; SI-NEXT: v_mov_b32_e32 v20, s27 +; SI-NEXT: v_writelane_b32 v21, s96, 32 +; SI-NEXT: v_readfirstlane_b32 s25, v20 +; SI-NEXT: v_mov_b32_e32 v20, s28 +; SI-NEXT: v_writelane_b32 v21, s97, 33 +; SI-NEXT: v_readfirstlane_b32 s22, v20 +; SI-NEXT: v_mov_b32_e32 v20, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: v_writelane_b32 v20, s98, 34 -; SI-NEXT: v_readfirstlane_b32 s44, v1 -; SI-NEXT: v_readfirstlane_b32 s45, v2 -; SI-NEXT: v_readfirstlane_b32 s42, v3 -; SI-NEXT: v_readfirstlane_b32 s43, v4 -; SI-NEXT: v_readfirstlane_b32 s40, v5 -; SI-NEXT: v_readfirstlane_b32 s41, v6 +; SI-NEXT: v_writelane_b32 v21, s98, 34 +; SI-NEXT: v_readfirstlane_b32 s23, v20 +; SI-NEXT: v_readfirstlane_b32 s20, v1 +; SI-NEXT: v_readfirstlane_b32 s21, v2 +; SI-NEXT: v_readfirstlane_b32 s18, v3 +; SI-NEXT: v_readfirstlane_b32 s19, v4 +; SI-NEXT: v_readfirstlane_b32 s16, v5 +; SI-NEXT: v_readfirstlane_b32 s17, v6 ; SI-NEXT: v_readfirstlane_b32 s14, v7 ; SI-NEXT: v_readfirstlane_b32 s15, v8 ; SI-NEXT: v_readfirstlane_b32 s12, v9 @@ -7517,665 +7545,665 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: v_readfirstlane_b32 s6, v15 ; SI-NEXT: v_readfirstlane_b32 s7, v16 ; SI-NEXT: v_readfirstlane_b32 s4, v17 -; SI-NEXT: s_and_b64 s[46:47], vcc, exec +; SI-NEXT: s_and_b64 s[26:27], vcc, exec ; SI-NEXT: v_readfirstlane_b32 s5, v18 -; SI-NEXT: v_writelane_b32 v20, s99, 35 +; SI-NEXT: v_writelane_b32 v21, s99, 35 +; SI-NEXT: ; implicit-def: $vgpr23 : SGPR spill to VGPR lane ; SI-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane -; SI-NEXT: ; implicit-def: $vgpr21 : SGPR spill to VGPR lane ; SI-NEXT: s_cbranch_scc0 .LBB13_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s46, s5, 24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v22, s46, 42 -; SI-NEXT: s_lshr_b32 s46, s5, 16 -; SI-NEXT: v_writelane_b32 v22, s46, 43 -; SI-NEXT: s_lshr_b32 s46, s5, 8 -; SI-NEXT: v_writelane_b32 v22, s46, 44 -; SI-NEXT: s_lshr_b32 s46, s7, 24 -; SI-NEXT: v_writelane_b32 v22, s46, 45 -; SI-NEXT: s_lshr_b32 s46, s7, 16 -; SI-NEXT: v_writelane_b32 v22, s46, 46 -; SI-NEXT: s_lshr_b32 s46, s7, 8 -; SI-NEXT: v_writelane_b32 v22, s46, 47 -; SI-NEXT: s_lshr_b32 s46, s9, 16 -; SI-NEXT: v_writelane_b32 v22, s46, 48 -; SI-NEXT: s_lshr_b32 s46, s9, 8 -; SI-NEXT: v_writelane_b32 v22, s46, 49 -; SI-NEXT: s_lshr_b32 s46, s11, 24 -; SI-NEXT: v_writelane_b32 v22, s46, 50 -; SI-NEXT: s_lshr_b32 s46, s11, 16 -; SI-NEXT: v_writelane_b32 v22, s46, 51 -; SI-NEXT: s_lshr_b32 s46, s11, 8 -; SI-NEXT: v_writelane_b32 v22, s46, 52 -; SI-NEXT: s_lshr_b32 s46, s13, 24 -; SI-NEXT: v_writelane_b32 v22, s46, 53 -; SI-NEXT: s_lshr_b32 s46, s13, 16 -; SI-NEXT: v_writelane_b32 v22, s46, 54 -; SI-NEXT: s_lshr_b32 s46, s13, 8 -; SI-NEXT: v_writelane_b32 v22, s46, 55 -; SI-NEXT: s_lshr_b32 s46, s15, 24 -; SI-NEXT: v_writelane_b32 v22, s46, 56 -; SI-NEXT: s_lshr_b32 s46, s15, 16 -; SI-NEXT: v_writelane_b32 v22, s46, 57 -; SI-NEXT: s_lshr_b32 s46, s15, 8 -; SI-NEXT: v_writelane_b32 v22, s46, 58 -; SI-NEXT: s_lshr_b32 s46, s41, 24 -; SI-NEXT: v_writelane_b32 v22, s46, 59 -; SI-NEXT: s_lshr_b32 s46, s41, 16 -; SI-NEXT: v_writelane_b32 v22, s46, 60 -; SI-NEXT: s_lshr_b32 s46, s41, 8 -; SI-NEXT: v_writelane_b32 v22, s46, 61 -; SI-NEXT: s_lshr_b32 s46, s43, 24 -; SI-NEXT: v_writelane_b32 v22, s46, 62 -; SI-NEXT: s_lshr_b32 s46, s43, 16 -; SI-NEXT: v_writelane_b32 v22, s46, 63 -; SI-NEXT: s_lshr_b32 s46, s43, 8 -; SI-NEXT: v_writelane_b32 v21, s46, 0 -; SI-NEXT: s_lshr_b32 s46, s45, 24 -; SI-NEXT: v_writelane_b32 v21, s46, 1 -; SI-NEXT: s_lshr_b32 s46, s45, 16 -; SI-NEXT: v_writelane_b32 v21, s46, 2 -; SI-NEXT: s_lshr_b32 s46, s45, 8 -; SI-NEXT: v_writelane_b32 v21, s46, 3 -; SI-NEXT: s_lshr_b32 s46, s29, 24 -; SI-NEXT: v_writelane_b32 v21, s46, 4 -; SI-NEXT: s_lshr_b32 s46, s29, 16 -; SI-NEXT: v_writelane_b32 v21, s46, 5 -; SI-NEXT: s_lshr_b32 s46, s29, 8 -; SI-NEXT: v_writelane_b32 v21, s46, 6 -; SI-NEXT: s_lshr_b32 s46, s27, 24 -; SI-NEXT: v_writelane_b32 v21, s46, 7 -; SI-NEXT: s_lshr_b32 s46, s27, 16 -; SI-NEXT: v_writelane_b32 v21, s46, 8 -; SI-NEXT: s_lshr_b32 s46, s27, 8 -; SI-NEXT: v_writelane_b32 v21, s46, 9 -; SI-NEXT: s_lshr_b32 s46, s25, 24 -; SI-NEXT: v_writelane_b32 v21, s46, 10 -; SI-NEXT: s_lshr_b32 s46, s25, 16 -; SI-NEXT: v_writelane_b32 v21, s46, 11 -; SI-NEXT: s_lshr_b32 s46, s25, 8 -; SI-NEXT: v_writelane_b32 v21, s46, 12 -; SI-NEXT: s_lshr_b32 s46, s23, 24 -; SI-NEXT: v_writelane_b32 v21, s46, 13 -; SI-NEXT: s_lshr_b32 s46, s23, 16 -; SI-NEXT: v_writelane_b32 v21, s46, 14 -; SI-NEXT: s_lshr_b32 s46, s23, 8 -; SI-NEXT: v_writelane_b32 v21, s46, 15 -; SI-NEXT: s_lshr_b32 s46, s21, 24 -; SI-NEXT: v_writelane_b32 v21, s46, 16 -; SI-NEXT: s_lshr_b32 s46, s21, 16 -; SI-NEXT: v_writelane_b32 v21, s46, 17 -; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 -; SI-NEXT: v_writelane_b32 v22, s46, 40 -; SI-NEXT: v_writelane_b32 v22, s47, 41 -; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 16 -; SI-NEXT: v_writelane_b32 v22, s46, 38 -; SI-NEXT: v_writelane_b32 v22, s47, 39 -; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 8 -; SI-NEXT: v_writelane_b32 v22, s46, 36 -; SI-NEXT: v_writelane_b32 v22, s47, 37 -; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 24 -; SI-NEXT: v_writelane_b32 v22, s46, 34 -; SI-NEXT: v_writelane_b32 v22, s47, 35 -; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 16 -; SI-NEXT: v_writelane_b32 v22, s46, 32 -; SI-NEXT: v_writelane_b32 v22, s47, 33 -; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 8 -; SI-NEXT: v_writelane_b32 v22, s46, 30 -; SI-NEXT: v_writelane_b32 v22, s47, 31 -; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 24 -; SI-NEXT: v_writelane_b32 v22, s46, 28 -; SI-NEXT: v_writelane_b32 v22, s47, 29 -; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 16 -; SI-NEXT: v_writelane_b32 v22, s46, 26 -; SI-NEXT: v_writelane_b32 v22, s47, 27 -; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 8 -; SI-NEXT: v_writelane_b32 v22, s46, 24 -; SI-NEXT: v_writelane_b32 v22, s47, 25 -; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 24 -; SI-NEXT: v_writelane_b32 v22, s46, 22 -; SI-NEXT: v_writelane_b32 v22, s47, 23 -; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 16 -; SI-NEXT: v_writelane_b32 v22, s46, 20 -; SI-NEXT: v_writelane_b32 v22, s47, 21 -; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 8 -; SI-NEXT: v_writelane_b32 v22, s46, 18 -; SI-NEXT: v_writelane_b32 v22, s47, 19 -; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 24 -; SI-NEXT: v_writelane_b32 v22, s46, 16 -; SI-NEXT: v_writelane_b32 v22, s47, 17 -; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 16 -; SI-NEXT: v_writelane_b32 v22, s46, 14 -; SI-NEXT: v_writelane_b32 v22, s47, 15 -; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 8 -; SI-NEXT: v_writelane_b32 v22, s46, 12 -; SI-NEXT: v_writelane_b32 v22, s47, 13 -; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 24 -; SI-NEXT: v_writelane_b32 v22, s46, 10 -; SI-NEXT: v_writelane_b32 v22, s47, 11 -; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 16 -; SI-NEXT: v_writelane_b32 v22, s46, 8 -; SI-NEXT: v_writelane_b32 v22, s47, 9 -; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 8 -; SI-NEXT: v_writelane_b32 v22, s46, 6 -; SI-NEXT: v_writelane_b32 v22, s47, 7 -; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 24 -; SI-NEXT: v_writelane_b32 v22, s46, 4 -; SI-NEXT: v_writelane_b32 v22, s47, 5 -; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 16 -; SI-NEXT: v_writelane_b32 v22, s46, 2 -; SI-NEXT: v_writelane_b32 v22, s47, 3 -; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 8 -; SI-NEXT: v_writelane_b32 v22, s46, 0 +; SI-NEXT: s_lshr_b32 s26, s5, 24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v23, s26, 42 +; SI-NEXT: s_lshr_b32 s26, s5, 16 +; SI-NEXT: v_writelane_b32 v23, s26, 43 +; SI-NEXT: s_lshr_b32 s26, s5, 8 +; SI-NEXT: v_writelane_b32 v23, s26, 44 +; SI-NEXT: s_lshr_b32 s26, s7, 24 +; SI-NEXT: v_writelane_b32 v23, s26, 45 +; SI-NEXT: s_lshr_b32 s26, s7, 16 +; SI-NEXT: v_writelane_b32 v23, s26, 46 +; SI-NEXT: s_lshr_b32 s26, s7, 8 +; SI-NEXT: v_writelane_b32 v23, s26, 47 +; SI-NEXT: s_lshr_b32 s26, s9, 16 +; SI-NEXT: v_writelane_b32 v23, s26, 48 +; SI-NEXT: s_lshr_b32 s26, s9, 8 +; SI-NEXT: v_writelane_b32 v23, s26, 49 +; SI-NEXT: s_lshr_b32 s26, s11, 24 +; SI-NEXT: v_writelane_b32 v23, s26, 50 +; SI-NEXT: s_lshr_b32 s26, s11, 16 +; SI-NEXT: v_writelane_b32 v23, s26, 51 +; SI-NEXT: s_lshr_b32 s26, s11, 8 +; SI-NEXT: v_writelane_b32 v23, s26, 52 +; SI-NEXT: s_lshr_b32 s26, s13, 24 +; SI-NEXT: v_writelane_b32 v23, s26, 53 +; SI-NEXT: s_lshr_b32 s26, s13, 16 +; SI-NEXT: v_writelane_b32 v23, s26, 54 +; SI-NEXT: s_lshr_b32 s26, s13, 8 +; SI-NEXT: v_writelane_b32 v23, s26, 55 +; SI-NEXT: s_lshr_b32 s26, s15, 24 +; SI-NEXT: v_writelane_b32 v23, s26, 56 +; SI-NEXT: s_lshr_b32 s26, s15, 16 +; SI-NEXT: v_writelane_b32 v23, s26, 57 +; SI-NEXT: s_lshr_b32 s26, s15, 8 +; SI-NEXT: v_writelane_b32 v23, s26, 58 +; SI-NEXT: s_lshr_b32 s26, s17, 24 +; SI-NEXT: v_writelane_b32 v23, s26, 59 +; SI-NEXT: s_lshr_b32 s26, s17, 16 +; SI-NEXT: v_writelane_b32 v23, s26, 60 +; SI-NEXT: s_lshr_b32 s26, s17, 8 +; SI-NEXT: v_writelane_b32 v23, s26, 61 +; SI-NEXT: s_lshr_b32 s26, s19, 24 +; SI-NEXT: v_writelane_b32 v23, s26, 62 +; SI-NEXT: s_lshr_b32 s26, s19, 16 +; SI-NEXT: v_writelane_b32 v23, s26, 63 +; SI-NEXT: s_lshr_b32 s26, s19, 8 +; SI-NEXT: v_writelane_b32 v22, s26, 0 +; SI-NEXT: s_lshr_b32 s26, s21, 24 +; SI-NEXT: v_writelane_b32 v22, s26, 1 +; SI-NEXT: s_lshr_b32 s26, s21, 16 +; SI-NEXT: v_writelane_b32 v22, s26, 2 +; SI-NEXT: s_lshr_b32 s26, s21, 8 +; SI-NEXT: v_writelane_b32 v22, s26, 3 +; SI-NEXT: s_lshr_b32 s26, s23, 24 +; SI-NEXT: v_writelane_b32 v22, s26, 4 +; SI-NEXT: s_lshr_b32 s26, s23, 16 +; SI-NEXT: v_writelane_b32 v22, s26, 5 +; SI-NEXT: s_lshr_b32 s26, s23, 8 +; SI-NEXT: v_writelane_b32 v22, s26, 6 +; SI-NEXT: s_lshr_b32 s26, s25, 24 +; SI-NEXT: v_writelane_b32 v22, s26, 7 +; SI-NEXT: s_lshr_b32 s26, s25, 16 +; SI-NEXT: v_writelane_b32 v22, s26, 8 +; SI-NEXT: s_lshr_b32 s26, s25, 8 +; SI-NEXT: v_writelane_b32 v22, s26, 9 +; SI-NEXT: s_lshr_b32 s26, s41, 24 +; SI-NEXT: v_writelane_b32 v22, s26, 10 +; SI-NEXT: s_lshr_b32 s26, s41, 16 +; SI-NEXT: v_writelane_b32 v22, s26, 11 +; SI-NEXT: s_lshr_b32 s26, s41, 8 +; SI-NEXT: v_writelane_b32 v22, s26, 12 +; SI-NEXT: s_lshr_b32 s26, s43, 24 +; SI-NEXT: v_writelane_b32 v22, s26, 13 +; SI-NEXT: s_lshr_b32 s26, s43, 16 +; SI-NEXT: v_writelane_b32 v22, s26, 14 +; SI-NEXT: s_lshr_b32 s26, s43, 8 +; SI-NEXT: v_writelane_b32 v22, s26, 15 +; SI-NEXT: s_lshr_b32 s26, s45, 24 +; SI-NEXT: v_writelane_b32 v22, s26, 16 +; SI-NEXT: s_lshr_b32 s26, s45, 16 +; SI-NEXT: v_writelane_b32 v22, s26, 17 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 24 +; SI-NEXT: v_writelane_b32 v23, s26, 40 +; SI-NEXT: v_writelane_b32 v23, s27, 41 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: v_writelane_b32 v23, s26, 38 +; SI-NEXT: v_writelane_b32 v23, s27, 39 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 8 +; SI-NEXT: v_writelane_b32 v23, s26, 36 +; SI-NEXT: v_writelane_b32 v23, s27, 37 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 24 +; SI-NEXT: v_writelane_b32 v23, s26, 34 +; SI-NEXT: v_writelane_b32 v23, s27, 35 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 16 +; SI-NEXT: v_writelane_b32 v23, s26, 32 +; SI-NEXT: v_writelane_b32 v23, s27, 33 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 8 +; SI-NEXT: v_writelane_b32 v23, s26, 30 +; SI-NEXT: v_writelane_b32 v23, s27, 31 +; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 24 +; SI-NEXT: v_writelane_b32 v23, s26, 28 +; SI-NEXT: v_writelane_b32 v23, s27, 29 +; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 16 +; SI-NEXT: v_writelane_b32 v23, s26, 26 +; SI-NEXT: v_writelane_b32 v23, s27, 27 +; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 8 +; SI-NEXT: v_writelane_b32 v23, s26, 24 +; SI-NEXT: v_writelane_b32 v23, s27, 25 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 24 +; SI-NEXT: v_writelane_b32 v23, s26, 22 +; SI-NEXT: v_writelane_b32 v23, s27, 23 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 16 +; SI-NEXT: v_writelane_b32 v23, s26, 20 +; SI-NEXT: v_writelane_b32 v23, s27, 21 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 8 +; SI-NEXT: v_writelane_b32 v23, s26, 18 +; SI-NEXT: v_writelane_b32 v23, s27, 19 +; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 24 +; SI-NEXT: v_writelane_b32 v23, s26, 16 +; SI-NEXT: v_writelane_b32 v23, s27, 17 +; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 16 +; SI-NEXT: v_writelane_b32 v23, s26, 14 +; SI-NEXT: v_writelane_b32 v23, s27, 15 +; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 8 +; SI-NEXT: v_writelane_b32 v23, s26, 12 +; SI-NEXT: v_writelane_b32 v23, s27, 13 +; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 24 +; SI-NEXT: v_writelane_b32 v23, s26, 10 +; SI-NEXT: v_writelane_b32 v23, s27, 11 +; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 16 +; SI-NEXT: v_writelane_b32 v23, s26, 8 +; SI-NEXT: v_writelane_b32 v23, s27, 9 +; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 8 +; SI-NEXT: v_writelane_b32 v23, s26, 6 +; SI-NEXT: v_writelane_b32 v23, s27, 7 +; SI-NEXT: s_lshr_b64 s[26:27], s[16:17], 24 +; SI-NEXT: v_writelane_b32 v23, s26, 4 +; SI-NEXT: v_writelane_b32 v23, s27, 5 +; SI-NEXT: s_lshr_b64 s[26:27], s[16:17], 16 +; SI-NEXT: v_writelane_b32 v23, s26, 2 +; SI-NEXT: v_writelane_b32 v23, s27, 3 +; SI-NEXT: s_lshr_b64 s[26:27], s[16:17], 8 +; SI-NEXT: v_writelane_b32 v23, s26, 0 ; SI-NEXT: s_lshr_b32 s50, s9, 24 -; SI-NEXT: s_lshr_b32 s51, s21, 8 -; SI-NEXT: s_lshr_b32 s48, s19, 24 -; SI-NEXT: s_lshr_b32 s52, s19, 16 -; SI-NEXT: s_lshr_b32 s53, s19, 8 -; SI-NEXT: s_lshr_b32 s54, s17, 24 -; SI-NEXT: s_lshr_b32 s55, s17, 16 -; SI-NEXT: s_lshr_b32 s49, s17, 8 -; SI-NEXT: v_writelane_b32 v22, s47, 1 -; SI-NEXT: s_lshr_b64 s[64:65], s[42:43], 24 -; SI-NEXT: s_lshr_b64 s[66:67], s[42:43], 16 -; SI-NEXT: s_lshr_b64 s[68:69], s[42:43], 8 -; SI-NEXT: s_lshr_b64 s[70:71], s[44:45], 24 -; SI-NEXT: s_lshr_b64 s[80:81], s[44:45], 16 -; SI-NEXT: s_lshr_b64 s[82:83], s[44:45], 8 -; SI-NEXT: s_lshr_b64 s[84:85], s[28:29], 24 -; SI-NEXT: s_lshr_b64 s[86:87], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[96:97], s[28:29], 8 -; SI-NEXT: s_lshr_b64 s[98:99], s[26:27], 24 -; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[56:57], s[26:27], 8 -; SI-NEXT: s_lshr_b64 s[58:59], s[24:25], 24 -; SI-NEXT: s_lshr_b64 s[60:61], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[62:63], s[24:25], 8 -; SI-NEXT: s_lshr_b64 s[72:73], s[22:23], 24 -; SI-NEXT: s_lshr_b64 s[74:75], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[22:23], 8 -; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 -; SI-NEXT: s_lshr_b64 s[88:89], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[90:91], s[20:21], 8 -; SI-NEXT: s_lshr_b64 s[92:93], s[18:19], 24 -; SI-NEXT: s_lshr_b64 s[94:95], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[30:31], s[18:19], 8 -; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 24 -; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 16 -; SI-NEXT: s_lshr_b64 s[38:39], s[16:17], 8 +; SI-NEXT: s_lshr_b32 s51, s45, 8 +; SI-NEXT: s_lshr_b32 s48, s47, 24 +; SI-NEXT: s_lshr_b32 s52, s47, 16 +; SI-NEXT: s_lshr_b32 s53, s47, 8 +; SI-NEXT: s_lshr_b32 s54, s57, 24 +; SI-NEXT: s_lshr_b32 s55, s57, 16 +; SI-NEXT: s_lshr_b32 s49, s57, 8 +; SI-NEXT: v_writelane_b32 v23, s27, 1 +; SI-NEXT: s_lshr_b64 s[64:65], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[66:67], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[68:69], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[70:71], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[80:81], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[82:83], s[20:21], 8 +; SI-NEXT: s_lshr_b64 s[84:85], s[22:23], 24 +; SI-NEXT: s_lshr_b64 s[86:87], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[96:97], s[22:23], 8 +; SI-NEXT: s_lshr_b64 s[98:99], s[24:25], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[24:25], 8 +; SI-NEXT: s_lshr_b64 s[58:59], s[40:41], 24 +; SI-NEXT: s_lshr_b64 s[60:61], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[40:41], 8 +; SI-NEXT: s_lshr_b64 s[72:73], s[42:43], 24 +; SI-NEXT: s_lshr_b64 s[74:75], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[42:43], 8 +; SI-NEXT: s_lshr_b64 s[78:79], s[44:45], 24 +; SI-NEXT: s_lshr_b64 s[88:89], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[44:45], 8 +; SI-NEXT: s_lshr_b64 s[92:93], s[46:47], 24 +; SI-NEXT: s_lshr_b64 s[94:95], s[46:47], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[46:47], 8 +; SI-NEXT: s_lshr_b64 s[34:35], s[56:57], 24 +; SI-NEXT: s_lshr_b64 s[36:37], s[56:57], 16 +; SI-NEXT: s_lshr_b64 s[38:39], s[56:57], 8 ; SI-NEXT: s_cbranch_execnz .LBB13_3 ; SI-NEXT: .LBB13_2: ; %cmp.true ; SI-NEXT: s_add_i32 s5, s5, 3 ; SI-NEXT: s_add_i32 s4, s4, 3 -; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 -; SI-NEXT: v_writelane_b32 v22, s46, 40 -; SI-NEXT: v_writelane_b32 v22, s47, 41 -; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 16 -; SI-NEXT: v_writelane_b32 v22, s46, 38 -; SI-NEXT: v_writelane_b32 v22, s47, 39 -; SI-NEXT: s_lshr_b32 s46, s5, 24 -; SI-NEXT: v_writelane_b32 v22, s46, 42 -; SI-NEXT: s_lshr_b32 s46, s5, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 24 +; SI-NEXT: v_writelane_b32 v23, s26, 40 +; SI-NEXT: v_writelane_b32 v23, s27, 41 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: v_writelane_b32 v23, s26, 38 +; SI-NEXT: v_writelane_b32 v23, s27, 39 +; SI-NEXT: s_lshr_b32 s26, s5, 24 +; SI-NEXT: v_writelane_b32 v23, s26, 42 +; SI-NEXT: s_lshr_b32 s26, s5, 16 ; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: v_writelane_b32 v22, s46, 43 -; SI-NEXT: s_lshr_b32 s46, s5, 8 -; SI-NEXT: v_writelane_b32 v22, s46, 44 -; SI-NEXT: s_lshr_b32 s46, s7, 24 -; SI-NEXT: v_writelane_b32 v22, s46, 45 -; SI-NEXT: s_lshr_b32 s46, s7, 16 +; SI-NEXT: v_writelane_b32 v23, s26, 43 +; SI-NEXT: s_lshr_b32 s26, s5, 8 +; SI-NEXT: v_writelane_b32 v23, s26, 44 +; SI-NEXT: s_lshr_b32 s26, s7, 24 +; SI-NEXT: v_writelane_b32 v23, s26, 45 +; SI-NEXT: s_lshr_b32 s26, s7, 16 ; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: v_writelane_b32 v22, s46, 46 -; SI-NEXT: s_lshr_b32 s46, s7, 8 -; SI-NEXT: v_writelane_b32 v22, s46, 47 -; SI-NEXT: s_lshr_b32 s46, s9, 16 +; SI-NEXT: v_writelane_b32 v23, s26, 46 +; SI-NEXT: s_lshr_b32 s26, s7, 8 +; SI-NEXT: v_writelane_b32 v23, s26, 47 +; SI-NEXT: s_lshr_b32 s26, s9, 16 ; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: v_writelane_b32 v22, s46, 48 -; SI-NEXT: s_lshr_b32 s46, s9, 8 -; SI-NEXT: v_writelane_b32 v22, s46, 49 -; SI-NEXT: s_lshr_b32 s46, s11, 24 -; SI-NEXT: v_writelane_b32 v22, s46, 50 -; SI-NEXT: s_lshr_b32 s46, s11, 16 +; SI-NEXT: v_writelane_b32 v23, s26, 48 +; SI-NEXT: s_lshr_b32 s26, s9, 8 +; SI-NEXT: v_writelane_b32 v23, s26, 49 +; SI-NEXT: s_lshr_b32 s26, s11, 24 +; SI-NEXT: v_writelane_b32 v23, s26, 50 +; SI-NEXT: s_lshr_b32 s26, s11, 16 ; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: v_writelane_b32 v22, s46, 51 -; SI-NEXT: s_lshr_b32 s46, s11, 8 -; SI-NEXT: v_writelane_b32 v22, s46, 52 -; SI-NEXT: s_lshr_b32 s46, s13, 24 -; SI-NEXT: v_writelane_b32 v22, s46, 53 -; SI-NEXT: s_lshr_b32 s46, s13, 16 +; SI-NEXT: v_writelane_b32 v23, s26, 51 +; SI-NEXT: s_lshr_b32 s26, s11, 8 +; SI-NEXT: v_writelane_b32 v23, s26, 52 +; SI-NEXT: s_lshr_b32 s26, s13, 24 +; SI-NEXT: v_writelane_b32 v23, s26, 53 +; SI-NEXT: s_lshr_b32 s26, s13, 16 ; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: v_writelane_b32 v22, s46, 54 -; SI-NEXT: s_lshr_b32 s46, s13, 8 -; SI-NEXT: v_writelane_b32 v22, s46, 55 -; SI-NEXT: s_lshr_b32 s46, s15, 24 -; SI-NEXT: v_writelane_b32 v22, s46, 56 -; SI-NEXT: s_lshr_b32 s46, s15, 16 +; SI-NEXT: v_writelane_b32 v23, s26, 54 +; SI-NEXT: s_lshr_b32 s26, s13, 8 +; SI-NEXT: v_writelane_b32 v23, s26, 55 +; SI-NEXT: s_lshr_b32 s26, s15, 24 +; SI-NEXT: v_writelane_b32 v23, s26, 56 +; SI-NEXT: s_lshr_b32 s26, s15, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_writelane_b32 v23, s26, 57 +; SI-NEXT: s_lshr_b32 s26, s15, 8 +; SI-NEXT: v_writelane_b32 v23, s26, 58 +; SI-NEXT: s_lshr_b32 s26, s17, 24 +; SI-NEXT: v_writelane_b32 v23, s26, 59 +; SI-NEXT: s_lshr_b32 s26, s17, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: v_writelane_b32 v23, s26, 60 +; SI-NEXT: s_lshr_b32 s26, s17, 8 +; SI-NEXT: v_writelane_b32 v23, s26, 61 +; SI-NEXT: s_lshr_b32 s26, s19, 24 +; SI-NEXT: v_writelane_b32 v23, s26, 62 +; SI-NEXT: s_lshr_b32 s26, s19, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: v_writelane_b32 v23, s26, 63 +; SI-NEXT: s_lshr_b32 s26, s19, 8 +; SI-NEXT: v_writelane_b32 v22, s26, 0 +; SI-NEXT: s_lshr_b32 s26, s21, 24 +; SI-NEXT: v_writelane_b32 v22, s26, 1 +; SI-NEXT: s_lshr_b32 s26, s21, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_writelane_b32 v22, s26, 2 +; SI-NEXT: s_lshr_b32 s26, s21, 8 +; SI-NEXT: v_writelane_b32 v22, s26, 3 +; SI-NEXT: s_lshr_b32 s26, s23, 24 +; SI-NEXT: v_writelane_b32 v22, s26, 4 +; SI-NEXT: s_lshr_b32 s26, s23, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: v_writelane_b32 v22, s26, 5 +; SI-NEXT: s_lshr_b32 s26, s23, 8 +; SI-NEXT: v_writelane_b32 v22, s26, 6 +; SI-NEXT: s_lshr_b32 s26, s25, 24 +; SI-NEXT: v_writelane_b32 v22, s26, 7 +; SI-NEXT: s_lshr_b32 s26, s25, 16 ; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: v_writelane_b32 v22, s46, 57 -; SI-NEXT: s_lshr_b32 s46, s15, 8 -; SI-NEXT: v_writelane_b32 v22, s46, 58 -; SI-NEXT: s_lshr_b32 s46, s41, 24 -; SI-NEXT: v_writelane_b32 v22, s46, 59 -; SI-NEXT: s_lshr_b32 s46, s41, 16 +; SI-NEXT: v_writelane_b32 v22, s26, 8 +; SI-NEXT: s_lshr_b32 s26, s25, 8 +; SI-NEXT: v_writelane_b32 v22, s26, 9 +; SI-NEXT: s_lshr_b32 s26, s41, 24 +; SI-NEXT: v_writelane_b32 v22, s26, 10 +; SI-NEXT: s_lshr_b32 s26, s41, 16 ; SI-NEXT: s_add_i32 s43, s43, 3 -; SI-NEXT: v_writelane_b32 v22, s46, 60 -; SI-NEXT: s_lshr_b32 s46, s41, 8 -; SI-NEXT: v_writelane_b32 v22, s46, 61 -; SI-NEXT: s_lshr_b32 s46, s43, 24 -; SI-NEXT: v_writelane_b32 v22, s46, 62 -; SI-NEXT: s_lshr_b32 s46, s43, 16 +; SI-NEXT: v_writelane_b32 v22, s26, 11 +; SI-NEXT: s_lshr_b32 s26, s41, 8 +; SI-NEXT: v_writelane_b32 v22, s26, 12 +; SI-NEXT: s_lshr_b32 s26, s43, 24 +; SI-NEXT: v_writelane_b32 v22, s26, 13 +; SI-NEXT: s_lshr_b32 s26, s43, 16 ; SI-NEXT: s_add_i32 s45, s45, 3 -; SI-NEXT: v_writelane_b32 v22, s46, 63 -; SI-NEXT: s_lshr_b32 s46, s43, 8 -; SI-NEXT: v_writelane_b32 v21, s46, 0 -; SI-NEXT: s_lshr_b32 s46, s45, 24 -; SI-NEXT: v_writelane_b32 v21, s46, 1 -; SI-NEXT: s_lshr_b32 s46, s45, 16 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: v_writelane_b32 v21, s46, 2 -; SI-NEXT: s_lshr_b32 s46, s45, 8 -; SI-NEXT: v_writelane_b32 v21, s46, 3 -; SI-NEXT: s_lshr_b32 s46, s29, 24 -; SI-NEXT: v_writelane_b32 v21, s46, 4 -; SI-NEXT: s_lshr_b32 s46, s29, 16 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: v_writelane_b32 v21, s46, 5 -; SI-NEXT: s_lshr_b32 s46, s29, 8 -; SI-NEXT: v_writelane_b32 v21, s46, 6 -; SI-NEXT: s_lshr_b32 s46, s27, 24 -; SI-NEXT: v_writelane_b32 v21, s46, 7 -; SI-NEXT: s_lshr_b32 s46, s27, 16 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: v_writelane_b32 v21, s46, 8 -; SI-NEXT: s_lshr_b32 s46, s27, 8 -; SI-NEXT: v_writelane_b32 v21, s46, 9 -; SI-NEXT: s_lshr_b32 s46, s25, 24 -; SI-NEXT: v_writelane_b32 v21, s46, 10 -; SI-NEXT: s_lshr_b32 s46, s25, 16 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: v_writelane_b32 v21, s46, 11 -; SI-NEXT: s_lshr_b32 s46, s25, 8 -; SI-NEXT: v_writelane_b32 v21, s46, 12 -; SI-NEXT: s_lshr_b32 s46, s23, 24 -; SI-NEXT: v_writelane_b32 v21, s46, 13 -; SI-NEXT: s_lshr_b32 s46, s23, 16 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: v_writelane_b32 v21, s46, 14 -; SI-NEXT: s_lshr_b32 s46, s23, 8 -; SI-NEXT: v_writelane_b32 v21, s46, 15 -; SI-NEXT: s_lshr_b32 s46, s21, 24 -; SI-NEXT: v_writelane_b32 v21, s46, 16 -; SI-NEXT: s_lshr_b32 s46, s21, 16 -; SI-NEXT: v_writelane_b32 v21, s46, 17 -; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 8 +; SI-NEXT: v_writelane_b32 v22, s26, 14 +; SI-NEXT: s_lshr_b32 s26, s43, 8 +; SI-NEXT: v_writelane_b32 v22, s26, 15 +; SI-NEXT: s_lshr_b32 s26, s45, 24 +; SI-NEXT: v_writelane_b32 v22, s26, 16 +; SI-NEXT: s_lshr_b32 s26, s45, 16 +; SI-NEXT: v_writelane_b32 v22, s26, 17 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 8 ; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: v_writelane_b32 v22, s46, 36 -; SI-NEXT: v_writelane_b32 v22, s47, 37 -; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 24 -; SI-NEXT: v_writelane_b32 v22, s46, 34 -; SI-NEXT: v_writelane_b32 v22, s47, 35 -; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 16 -; SI-NEXT: v_writelane_b32 v22, s46, 32 -; SI-NEXT: v_writelane_b32 v22, s47, 33 -; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 8 +; SI-NEXT: v_writelane_b32 v23, s26, 36 +; SI-NEXT: v_writelane_b32 v23, s27, 37 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 24 +; SI-NEXT: v_writelane_b32 v23, s26, 34 +; SI-NEXT: v_writelane_b32 v23, s27, 35 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 16 +; SI-NEXT: v_writelane_b32 v23, s26, 32 +; SI-NEXT: v_writelane_b32 v23, s27, 33 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 8 ; SI-NEXT: s_add_i32 s8, s8, 3 -; SI-NEXT: v_writelane_b32 v22, s46, 30 -; SI-NEXT: v_writelane_b32 v22, s47, 31 -; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 24 -; SI-NEXT: v_writelane_b32 v22, s46, 28 -; SI-NEXT: v_writelane_b32 v22, s47, 29 -; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 16 -; SI-NEXT: v_writelane_b32 v22, s46, 26 -; SI-NEXT: v_writelane_b32 v22, s47, 27 -; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 8 +; SI-NEXT: v_writelane_b32 v23, s26, 30 +; SI-NEXT: v_writelane_b32 v23, s27, 31 +; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 24 +; SI-NEXT: v_writelane_b32 v23, s26, 28 +; SI-NEXT: v_writelane_b32 v23, s27, 29 +; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 16 +; SI-NEXT: v_writelane_b32 v23, s26, 26 +; SI-NEXT: v_writelane_b32 v23, s27, 27 +; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 8 ; SI-NEXT: s_add_i32 s10, s10, 3 -; SI-NEXT: v_writelane_b32 v22, s46, 24 -; SI-NEXT: v_writelane_b32 v22, s47, 25 -; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 24 -; SI-NEXT: v_writelane_b32 v22, s46, 22 -; SI-NEXT: v_writelane_b32 v22, s47, 23 -; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 16 -; SI-NEXT: v_writelane_b32 v22, s46, 20 -; SI-NEXT: v_writelane_b32 v22, s47, 21 -; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 8 +; SI-NEXT: v_writelane_b32 v23, s26, 24 +; SI-NEXT: v_writelane_b32 v23, s27, 25 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 24 +; SI-NEXT: v_writelane_b32 v23, s26, 22 +; SI-NEXT: v_writelane_b32 v23, s27, 23 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 16 +; SI-NEXT: v_writelane_b32 v23, s26, 20 +; SI-NEXT: v_writelane_b32 v23, s27, 21 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 8 ; SI-NEXT: s_add_i32 s12, s12, 3 -; SI-NEXT: v_writelane_b32 v22, s46, 18 -; SI-NEXT: v_writelane_b32 v22, s47, 19 -; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 24 -; SI-NEXT: v_writelane_b32 v22, s46, 16 -; SI-NEXT: v_writelane_b32 v22, s47, 17 -; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 16 -; SI-NEXT: v_writelane_b32 v22, s46, 14 -; SI-NEXT: v_writelane_b32 v22, s47, 15 -; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 8 +; SI-NEXT: v_writelane_b32 v23, s26, 18 +; SI-NEXT: v_writelane_b32 v23, s27, 19 +; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 24 +; SI-NEXT: v_writelane_b32 v23, s26, 16 +; SI-NEXT: v_writelane_b32 v23, s27, 17 +; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 16 +; SI-NEXT: v_writelane_b32 v23, s26, 14 +; SI-NEXT: v_writelane_b32 v23, s27, 15 +; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 8 ; SI-NEXT: s_add_i32 s14, s14, 3 -; SI-NEXT: v_writelane_b32 v22, s46, 12 -; SI-NEXT: v_writelane_b32 v22, s47, 13 -; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 24 -; SI-NEXT: v_writelane_b32 v22, s46, 10 -; SI-NEXT: v_writelane_b32 v22, s47, 11 -; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 16 -; SI-NEXT: v_writelane_b32 v22, s46, 8 -; SI-NEXT: v_writelane_b32 v22, s47, 9 -; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 8 -; SI-NEXT: s_add_i32 s40, s40, 3 -; SI-NEXT: v_writelane_b32 v22, s46, 6 -; SI-NEXT: v_writelane_b32 v22, s47, 7 -; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 24 -; SI-NEXT: v_writelane_b32 v22, s46, 4 -; SI-NEXT: v_writelane_b32 v22, s47, 5 -; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 16 -; SI-NEXT: v_writelane_b32 v22, s46, 2 -; SI-NEXT: v_writelane_b32 v22, s47, 3 -; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 8 -; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_writelane_b32 v23, s26, 12 +; SI-NEXT: v_writelane_b32 v23, s27, 13 +; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 24 +; SI-NEXT: v_writelane_b32 v23, s26, 10 +; SI-NEXT: v_writelane_b32 v23, s27, 11 +; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 16 +; SI-NEXT: v_writelane_b32 v23, s26, 8 +; SI-NEXT: v_writelane_b32 v23, s27, 9 +; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 8 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_writelane_b32 v23, s26, 6 +; SI-NEXT: v_writelane_b32 v23, s27, 7 +; SI-NEXT: s_lshr_b64 s[26:27], s[16:17], 24 +; SI-NEXT: v_writelane_b32 v23, s26, 4 +; SI-NEXT: v_writelane_b32 v23, s27, 5 +; SI-NEXT: s_lshr_b64 s[26:27], s[16:17], 16 +; SI-NEXT: v_writelane_b32 v23, s26, 2 +; SI-NEXT: v_writelane_b32 v23, s27, 3 +; SI-NEXT: s_lshr_b64 s[26:27], s[16:17], 8 +; SI-NEXT: s_add_i32 s57, s57, 3 +; SI-NEXT: s_add_i32 s56, s56, 3 +; SI-NEXT: s_add_i32 s47, s47, 3 +; SI-NEXT: s_add_i32 s46, s46, 3 ; SI-NEXT: s_add_i32 s44, s44, 3 ; SI-NEXT: s_add_i32 s42, s42, 3 -; SI-NEXT: v_writelane_b32 v22, s46, 0 +; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_writelane_b32 v23, s26, 0 ; SI-NEXT: s_lshr_b32 s50, s9, 24 -; SI-NEXT: s_lshr_b32 s51, s21, 8 -; SI-NEXT: s_lshr_b32 s48, s19, 24 -; SI-NEXT: s_lshr_b32 s52, s19, 16 -; SI-NEXT: s_lshr_b32 s53, s19, 8 -; SI-NEXT: s_lshr_b32 s54, s17, 24 -; SI-NEXT: s_lshr_b32 s55, s17, 16 -; SI-NEXT: s_lshr_b32 s49, s17, 8 -; SI-NEXT: v_writelane_b32 v22, s47, 1 -; SI-NEXT: s_lshr_b64 s[64:65], s[42:43], 24 -; SI-NEXT: s_lshr_b64 s[66:67], s[42:43], 16 -; SI-NEXT: s_lshr_b64 s[68:69], s[42:43], 8 -; SI-NEXT: s_lshr_b64 s[70:71], s[44:45], 24 -; SI-NEXT: s_lshr_b64 s[80:81], s[44:45], 16 -; SI-NEXT: s_lshr_b64 s[82:83], s[44:45], 8 -; SI-NEXT: s_lshr_b64 s[84:85], s[28:29], 24 -; SI-NEXT: s_lshr_b64 s[86:87], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[96:97], s[28:29], 8 -; SI-NEXT: s_lshr_b64 s[98:99], s[26:27], 24 -; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[56:57], s[26:27], 8 -; SI-NEXT: s_lshr_b64 s[58:59], s[24:25], 24 -; SI-NEXT: s_lshr_b64 s[60:61], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[62:63], s[24:25], 8 -; SI-NEXT: s_lshr_b64 s[72:73], s[22:23], 24 -; SI-NEXT: s_lshr_b64 s[74:75], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[22:23], 8 -; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 -; SI-NEXT: s_lshr_b64 s[88:89], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[90:91], s[20:21], 8 -; SI-NEXT: s_lshr_b64 s[92:93], s[18:19], 24 -; SI-NEXT: s_lshr_b64 s[94:95], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[30:31], s[18:19], 8 -; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 24 -; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 16 -; SI-NEXT: s_lshr_b64 s[38:39], s[16:17], 8 +; SI-NEXT: s_lshr_b32 s51, s45, 8 +; SI-NEXT: s_lshr_b32 s48, s47, 24 +; SI-NEXT: s_lshr_b32 s52, s47, 16 +; SI-NEXT: s_lshr_b32 s53, s47, 8 +; SI-NEXT: s_lshr_b32 s54, s57, 24 +; SI-NEXT: s_lshr_b32 s55, s57, 16 +; SI-NEXT: s_lshr_b32 s49, s57, 8 +; SI-NEXT: v_writelane_b32 v23, s27, 1 +; SI-NEXT: s_lshr_b64 s[64:65], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[66:67], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[68:69], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[70:71], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[80:81], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[82:83], s[20:21], 8 +; SI-NEXT: s_lshr_b64 s[84:85], s[22:23], 24 +; SI-NEXT: s_lshr_b64 s[86:87], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[96:97], s[22:23], 8 +; SI-NEXT: s_lshr_b64 s[98:99], s[24:25], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[24:25], 8 +; SI-NEXT: s_lshr_b64 s[58:59], s[40:41], 24 +; SI-NEXT: s_lshr_b64 s[60:61], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[40:41], 8 +; SI-NEXT: s_lshr_b64 s[72:73], s[42:43], 24 +; SI-NEXT: s_lshr_b64 s[74:75], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[42:43], 8 +; SI-NEXT: s_lshr_b64 s[78:79], s[44:45], 24 +; SI-NEXT: s_lshr_b64 s[88:89], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[44:45], 8 +; SI-NEXT: s_lshr_b64 s[92:93], s[46:47], 24 +; SI-NEXT: s_lshr_b64 s[94:95], s[46:47], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[46:47], 8 +; SI-NEXT: s_lshr_b64 s[34:35], s[56:57], 24 +; SI-NEXT: s_lshr_b64 s[36:37], s[56:57], 16 +; SI-NEXT: s_lshr_b64 s[38:39], s[56:57], 8 ; SI-NEXT: .LBB13_3: ; %end -; SI-NEXT: s_lshl_b32 s47, s38, 8 -; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_or_b32 s16, s16, s47 -; SI-NEXT: s_and_b32 s47, s36, 0xff -; SI-NEXT: s_lshl_b32 s57, s34, 24 -; SI-NEXT: s_lshl_b32 s47, s47, 16 -; SI-NEXT: s_or_b32 s47, s57, s47 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s16, s16, s47 -; SI-NEXT: v_mov_b32_e32 v1, s16 -; SI-NEXT: s_and_b32 s16, s17, 0xff -; SI-NEXT: s_lshl_b32 s17, s49, 8 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: s_and_b32 s17, s55, 0xff -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_lshl_b32 s47, s54, 24 -; SI-NEXT: s_or_b32 s17, s47, s17 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_lshl_b32 s16, s30, 8 -; SI-NEXT: s_and_b32 s17, s18, 0xff -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_and_b32 s17, s94, 0xff -; SI-NEXT: s_lshl_b32 s18, s92, 24 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_mov_b32_e32 v3, s16 -; SI-NEXT: s_and_b32 s16, s19, 0xff -; SI-NEXT: s_lshl_b32 s17, s53, 8 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: s_and_b32 s17, s52, 0xff -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_lshl_b32 s18, s48, 24 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_mov_b32_e32 v4, s16 -; SI-NEXT: s_lshl_b32 s16, s90, 8 -; SI-NEXT: s_and_b32 s17, s20, 0xff -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_and_b32 s17, s88, 0xff -; SI-NEXT: s_lshl_b32 s18, s78, 24 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_mov_b32_e32 v5, s16 -; SI-NEXT: s_and_b32 s16, s21, 0xff -; SI-NEXT: s_lshl_b32 s17, s51, 8 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_readlane_b32 s17, v21, 17 -; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: v_readlane_b32 s18, v21, 16 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_lshl_b32 s18, s18, 24 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_mov_b32_e32 v6, s16 -; SI-NEXT: s_lshl_b32 s16, s76, 8 -; SI-NEXT: s_and_b32 s17, s22, 0xff -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_and_b32 s17, s74, 0xff -; SI-NEXT: s_lshl_b32 s18, s72, 24 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_readlane_b32 s17, v21, 15 -; SI-NEXT: v_mov_b32_e32 v7, s16 -; SI-NEXT: s_and_b32 s16, s23, 0xff -; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_readlane_b32 s17, v21, 14 -; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: v_readlane_b32 s18, v21, 13 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_lshl_b32 s18, s18, 24 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_mov_b32_e32 v8, s16 -; SI-NEXT: s_lshl_b32 s16, s62, 8 -; SI-NEXT: s_and_b32 s17, s24, 0xff -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_and_b32 s17, s60, 0xff -; SI-NEXT: s_lshl_b32 s18, s58, 24 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_readlane_b32 s17, v21, 12 -; SI-NEXT: v_mov_b32_e32 v9, s16 -; SI-NEXT: s_and_b32 s16, s25, 0xff -; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_readlane_b32 s17, v21, 11 -; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: v_readlane_b32 s18, v21, 10 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_lshl_b32 s18, s18, 24 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_mov_b32_e32 v10, s16 -; SI-NEXT: s_lshl_b32 s16, s56, 8 -; SI-NEXT: s_and_b32 s17, s26, 0xff -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_and_b32 s17, s46, 0xff -; SI-NEXT: s_lshl_b32 s18, s98, 24 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_readlane_b32 s17, v21, 9 -; SI-NEXT: v_mov_b32_e32 v11, s16 -; SI-NEXT: s_and_b32 s16, s27, 0xff -; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_readlane_b32 s17, v21, 8 -; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: v_readlane_b32 s18, v21, 7 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_lshl_b32 s18, s18, 24 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_mov_b32_e32 v12, s16 -; SI-NEXT: s_lshl_b32 s16, s96, 8 -; SI-NEXT: s_and_b32 s17, s28, 0xff -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_and_b32 s17, s86, 0xff -; SI-NEXT: s_lshl_b32 s18, s84, 24 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_readlane_b32 s17, v21, 6 -; SI-NEXT: v_mov_b32_e32 v13, s16 -; SI-NEXT: s_and_b32 s16, s29, 0xff -; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_readlane_b32 s17, v21, 5 -; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: v_readlane_b32 s18, v21, 4 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_lshl_b32 s18, s18, 24 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_mov_b32_e32 v14, s16 -; SI-NEXT: s_lshl_b32 s16, s82, 8 -; SI-NEXT: s_and_b32 s17, s44, 0xff -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_and_b32 s17, s80, 0xff -; SI-NEXT: s_lshl_b32 s18, s70, 24 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_readlane_b32 s17, v21, 3 -; SI-NEXT: v_mov_b32_e32 v15, s16 -; SI-NEXT: s_and_b32 s16, s45, 0xff -; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_readlane_b32 s17, v21, 2 -; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: v_readlane_b32 s18, v21, 1 +; SI-NEXT: s_lshl_b32 s27, s38, 8 +; SI-NEXT: s_and_b32 s29, s56, 0xff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s29, s36, 0xff +; SI-NEXT: s_lshl_b32 s56, s34, 24 +; SI-NEXT: s_lshl_b32 s29, s29, 16 +; SI-NEXT: s_or_b32 s29, s56, s29 +; SI-NEXT: s_and_b32 s27, s27, 0xffff +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_mov_b32_e32 v1, s27 +; SI-NEXT: s_and_b32 s27, s57, 0xff +; SI-NEXT: s_lshl_b32 s29, s49, 8 +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: s_and_b32 s29, s55, 0xff +; SI-NEXT: s_lshl_b32 s29, s29, 16 +; SI-NEXT: s_lshl_b32 s56, s54, 24 +; SI-NEXT: s_or_b32 s29, s56, s29 +; SI-NEXT: s_and_b32 s27, s27, 0xffff +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_lshl_b32 s27, s30, 8 +; SI-NEXT: s_and_b32 s29, s46, 0xff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s29, s94, 0xff +; SI-NEXT: s_lshl_b32 s46, s92, 24 +; SI-NEXT: s_lshl_b32 s29, s29, 16 +; SI-NEXT: s_or_b32 s29, s46, s29 +; SI-NEXT: s_and_b32 s27, s27, 0xffff +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_mov_b32_e32 v3, s27 +; SI-NEXT: s_and_b32 s27, s47, 0xff +; SI-NEXT: s_lshl_b32 s29, s53, 8 +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: s_and_b32 s29, s52, 0xff +; SI-NEXT: s_lshl_b32 s29, s29, 16 +; SI-NEXT: s_lshl_b32 s46, s48, 24 +; SI-NEXT: s_or_b32 s29, s46, s29 +; SI-NEXT: s_and_b32 s27, s27, 0xffff +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_mov_b32_e32 v4, s27 +; SI-NEXT: s_lshl_b32 s27, s90, 8 +; SI-NEXT: s_and_b32 s29, s44, 0xff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s29, s88, 0xff +; SI-NEXT: s_lshl_b32 s44, s78, 24 +; SI-NEXT: s_lshl_b32 s29, s29, 16 +; SI-NEXT: s_or_b32 s29, s44, s29 +; SI-NEXT: s_and_b32 s27, s27, 0xffff +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_mov_b32_e32 v5, s27 +; SI-NEXT: s_and_b32 s27, s45, 0xff +; SI-NEXT: s_lshl_b32 s29, s51, 8 +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_readlane_b32 s29, v22, 17 +; SI-NEXT: s_and_b32 s29, s29, 0xff +; SI-NEXT: v_readlane_b32 s44, v22, 16 +; SI-NEXT: s_lshl_b32 s29, s29, 16 +; SI-NEXT: s_lshl_b32 s44, s44, 24 +; SI-NEXT: s_or_b32 s29, s44, s29 +; SI-NEXT: s_and_b32 s27, s27, 0xffff +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_mov_b32_e32 v6, s27 +; SI-NEXT: s_lshl_b32 s27, s76, 8 +; SI-NEXT: s_and_b32 s29, s42, 0xff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s29, s74, 0xff +; SI-NEXT: s_lshl_b32 s42, s72, 24 +; SI-NEXT: s_lshl_b32 s29, s29, 16 +; SI-NEXT: s_or_b32 s29, s42, s29 +; SI-NEXT: s_and_b32 s27, s27, 0xffff +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_readlane_b32 s29, v22, 15 +; SI-NEXT: v_mov_b32_e32 v7, s27 +; SI-NEXT: s_and_b32 s27, s43, 0xff +; SI-NEXT: s_lshl_b32 s29, s29, 8 +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_readlane_b32 s29, v22, 14 +; SI-NEXT: s_and_b32 s29, s29, 0xff +; SI-NEXT: v_readlane_b32 s42, v22, 13 +; SI-NEXT: s_lshl_b32 s29, s29, 16 +; SI-NEXT: s_lshl_b32 s42, s42, 24 +; SI-NEXT: s_or_b32 s29, s42, s29 +; SI-NEXT: s_and_b32 s27, s27, 0xffff +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_mov_b32_e32 v8, s27 +; SI-NEXT: s_lshl_b32 s27, s62, 8 +; SI-NEXT: s_and_b32 s29, s40, 0xff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s29, s60, 0xff +; SI-NEXT: s_lshl_b32 s40, s58, 24 +; SI-NEXT: s_lshl_b32 s29, s29, 16 +; SI-NEXT: s_or_b32 s29, s40, s29 +; SI-NEXT: s_and_b32 s27, s27, 0xffff +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_readlane_b32 s29, v22, 12 +; SI-NEXT: v_mov_b32_e32 v9, s27 +; SI-NEXT: s_and_b32 s27, s41, 0xff +; SI-NEXT: s_lshl_b32 s29, s29, 8 +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_readlane_b32 s29, v22, 11 +; SI-NEXT: s_and_b32 s29, s29, 0xff +; SI-NEXT: v_readlane_b32 s40, v22, 10 +; SI-NEXT: s_lshl_b32 s29, s29, 16 +; SI-NEXT: s_lshl_b32 s40, s40, 24 +; SI-NEXT: s_or_b32 s29, s40, s29 +; SI-NEXT: s_and_b32 s27, s27, 0xffff +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_mov_b32_e32 v10, s27 +; SI-NEXT: s_lshl_b32 s27, s28, 8 +; SI-NEXT: s_and_b32 s24, s24, 0xff +; SI-NEXT: s_and_b32 s26, s26, 0xff +; SI-NEXT: s_or_b32 s24, s24, s27 +; SI-NEXT: s_lshl_b32 s27, s98, 24 +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 s26, s27, s26 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_or_b32 s24, s24, s26 +; SI-NEXT: v_mov_b32_e32 v11, s24 +; SI-NEXT: s_and_b32 s24, s25, 0xff +; SI-NEXT: v_readlane_b32 s25, v22, 9 +; SI-NEXT: s_lshl_b32 s25, s25, 8 +; SI-NEXT: s_or_b32 s24, s24, s25 +; SI-NEXT: v_readlane_b32 s25, v22, 8 +; SI-NEXT: s_and_b32 s25, s25, 0xff +; SI-NEXT: v_readlane_b32 s26, v22, 7 +; SI-NEXT: s_lshl_b32 s25, s25, 16 +; SI-NEXT: s_lshl_b32 s26, s26, 24 +; SI-NEXT: s_or_b32 s25, s26, s25 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_or_b32 s24, s24, s25 +; SI-NEXT: v_mov_b32_e32 v12, s24 +; SI-NEXT: s_lshl_b32 s24, s96, 8 +; SI-NEXT: s_and_b32 s22, s22, 0xff +; SI-NEXT: s_or_b32 s22, s22, s24 +; SI-NEXT: s_and_b32 s24, s86, 0xff +; SI-NEXT: s_lshl_b32 s25, s84, 24 +; SI-NEXT: s_lshl_b32 s24, s24, 16 +; SI-NEXT: s_or_b32 s24, s25, s24 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_or_b32 s22, s22, s24 +; SI-NEXT: v_mov_b32_e32 v13, s22 +; SI-NEXT: s_and_b32 s22, s23, 0xff +; SI-NEXT: v_readlane_b32 s23, v22, 6 +; SI-NEXT: s_lshl_b32 s23, s23, 8 +; SI-NEXT: s_or_b32 s22, s22, s23 +; SI-NEXT: v_readlane_b32 s23, v22, 5 +; SI-NEXT: s_and_b32 s23, s23, 0xff +; SI-NEXT: v_readlane_b32 s24, v22, 4 +; SI-NEXT: s_lshl_b32 s23, s23, 16 +; SI-NEXT: s_lshl_b32 s24, s24, 24 +; SI-NEXT: s_or_b32 s23, s24, s23 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_or_b32 s22, s22, s23 +; SI-NEXT: v_mov_b32_e32 v14, s22 +; SI-NEXT: s_lshl_b32 s22, s82, 8 +; SI-NEXT: s_and_b32 s20, s20, 0xff +; SI-NEXT: s_or_b32 s20, s20, s22 +; SI-NEXT: s_and_b32 s22, s80, 0xff +; SI-NEXT: s_lshl_b32 s23, s70, 24 +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_or_b32 s22, s23, s22 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_or_b32 s20, s20, s22 +; SI-NEXT: v_mov_b32_e32 v15, s20 +; SI-NEXT: s_and_b32 s20, s21, 0xff +; SI-NEXT: v_readlane_b32 s21, v22, 3 +; SI-NEXT: s_lshl_b32 s21, s21, 8 +; SI-NEXT: s_or_b32 s20, s20, s21 +; SI-NEXT: v_readlane_b32 s21, v22, 2 +; SI-NEXT: s_and_b32 s21, s21, 0xff +; SI-NEXT: v_readlane_b32 s22, v22, 1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_lshl_b32 s22, s22, 24 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s21, s22, s21 +; SI-NEXT: s_and_b32 s20, s20, 0xffff ; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s20, s20, s21 ; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: v_mov_b32_e32 v16, s16 -; SI-NEXT: s_lshl_b32 s16, s68, 8 -; SI-NEXT: s_and_b32 s17, s42, 0xff +; SI-NEXT: v_mov_b32_e32 v16, s20 +; SI-NEXT: s_lshl_b32 s20, s68, 8 +; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_and_b32 s17, s66, 0xff +; SI-NEXT: s_or_b32 s18, s18, s20 +; SI-NEXT: s_and_b32 s20, s66, 0xff ; SI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_lshl_b32 s18, s64, 24 -; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s21, s64, 24 +; SI-NEXT: s_lshl_b32 s20, s20, 16 ; SI-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s20, s21, s20 +; SI-NEXT: s_and_b32 s18, s18, 0xffff ; SI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s18, s18, s20 ; SI-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: v_readlane_b32 s17, v21, 0 ; SI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s43, 0xff -; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_and_b32 s18, s19, 0xff +; SI-NEXT: v_readlane_b32 s19, v22, 0 ; SI-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_readlane_b32 s17, v22, 63 +; SI-NEXT: s_lshl_b32 s19, s19, 8 ; SI-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: v_readlane_b32 s18, v22, 62 +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: v_readlane_b32 s19, v23, 63 ; SI-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_and_b32 s19, s19, 0xff +; SI-NEXT: v_readlane_b32 s20, v23, 62 ; SI-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: v_readlane_b32 s18, v22, 0 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s20, 24 ; SI-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: v_readlane_b32 s19, v22, 1 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_or_b32 s19, s20, s19 ; SI-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: s_lshl_b32 s17, s18, 8 -; SI-NEXT: v_readlane_b32 s18, v22, 2 +; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s40, 0xff -; SI-NEXT: v_readlane_b32 s19, v22, 3 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: s_and_b32 s17, s18, 0xff -; SI-NEXT: v_readlane_b32 s18, v22, 4 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_readlane_b32 s18, v23, 0 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: v_readlane_b32 s19, v23, 1 +; SI-NEXT: s_lshl_b32 s18, s18, 8 +; SI-NEXT: s_or_b32 s16, s16, s18 +; SI-NEXT: v_readlane_b32 s18, v23, 2 +; SI-NEXT: v_readlane_b32 s19, v23, 3 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: v_readlane_b32 s20, v23, 4 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_lshl_b32 s19, s20, 24 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_or_b32 s18, s19, s18 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_readlane_b32 s17, v22, 61 +; SI-NEXT: s_or_b32 s16, s16, s18 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s41, 0xff +; SI-NEXT: s_and_b32 s16, s17, 0xff +; SI-NEXT: v_readlane_b32 s17, v23, 61 ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_readlane_b32 s17, v22, 60 +; SI-NEXT: v_readlane_b32 s17, v23, 60 ; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: v_readlane_b32 s18, v22, 59 +; SI-NEXT: v_readlane_b32 s18, v23, 59 ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_lshl_b32 s18, s18, 24 ; SI-NEXT: s_and_b32 s16, s16, 0xffff @@ -8185,16 +8213,15 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: v_readlane_b32 s16, v22, 6 +; SI-NEXT: v_readlane_b32 s16, v23, 6 ; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: v_readlane_b32 s17, v22, 7 +; SI-NEXT: v_readlane_b32 s17, v23, 7 ; SI-NEXT: s_lshl_b32 s16, s16, 8 -; SI-NEXT: v_readlane_b32 s19, v22, 5 ; SI-NEXT: s_or_b32 s14, s14, s16 -; SI-NEXT: v_readlane_b32 s16, v22, 8 -; SI-NEXT: v_readlane_b32 s17, v22, 9 +; SI-NEXT: v_readlane_b32 s16, v23, 8 +; SI-NEXT: v_readlane_b32 s17, v23, 9 ; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: v_readlane_b32 s18, v22, 10 +; SI-NEXT: v_readlane_b32 s18, v23, 10 ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_lshl_b32 s17, s18, 24 ; SI-NEXT: s_and_b32 s14, s14, 0xffff @@ -8205,12 +8232,12 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s14 ; SI-NEXT: s_and_b32 s14, s15, 0xff -; SI-NEXT: v_readlane_b32 s15, v22, 58 +; SI-NEXT: v_readlane_b32 s15, v23, 58 ; SI-NEXT: s_lshl_b32 s15, s15, 8 ; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: v_readlane_b32 s15, v22, 57 +; SI-NEXT: v_readlane_b32 s15, v23, 57 ; SI-NEXT: s_and_b32 s15, s15, 0xff -; SI-NEXT: v_readlane_b32 s16, v22, 56 +; SI-NEXT: v_readlane_b32 s16, v23, 56 ; SI-NEXT: s_lshl_b32 s15, s15, 16 ; SI-NEXT: s_lshl_b32 s16, s16, 24 ; SI-NEXT: s_and_b32 s14, s14, 0xffff @@ -8220,15 +8247,15 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s14 -; SI-NEXT: v_readlane_b32 s14, v22, 12 +; SI-NEXT: v_readlane_b32 s14, v23, 12 ; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: v_readlane_b32 s15, v22, 13 +; SI-NEXT: v_readlane_b32 s15, v23, 13 ; SI-NEXT: s_lshl_b32 s14, s14, 8 ; SI-NEXT: s_or_b32 s12, s12, s14 -; SI-NEXT: v_readlane_b32 s14, v22, 14 -; SI-NEXT: v_readlane_b32 s15, v22, 15 +; SI-NEXT: v_readlane_b32 s14, v23, 14 +; SI-NEXT: v_readlane_b32 s15, v23, 15 ; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: v_readlane_b32 s16, v22, 16 +; SI-NEXT: v_readlane_b32 s16, v23, 16 ; SI-NEXT: s_lshl_b32 s14, s14, 16 ; SI-NEXT: s_lshl_b32 s15, s16, 24 ; SI-NEXT: s_and_b32 s12, s12, 0xffff @@ -8239,12 +8266,12 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s12 ; SI-NEXT: s_and_b32 s12, s13, 0xff -; SI-NEXT: v_readlane_b32 s13, v22, 55 +; SI-NEXT: v_readlane_b32 s13, v23, 55 ; SI-NEXT: s_lshl_b32 s13, s13, 8 ; SI-NEXT: s_or_b32 s12, s12, s13 -; SI-NEXT: v_readlane_b32 s13, v22, 54 +; SI-NEXT: v_readlane_b32 s13, v23, 54 ; SI-NEXT: s_and_b32 s13, s13, 0xff -; SI-NEXT: v_readlane_b32 s14, v22, 53 +; SI-NEXT: v_readlane_b32 s14, v23, 53 ; SI-NEXT: s_lshl_b32 s13, s13, 16 ; SI-NEXT: s_lshl_b32 s14, s14, 24 ; SI-NEXT: s_and_b32 s12, s12, 0xffff @@ -8254,15 +8281,15 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s12 -; SI-NEXT: v_readlane_b32 s12, v22, 18 +; SI-NEXT: v_readlane_b32 s12, v23, 18 ; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: v_readlane_b32 s13, v22, 19 +; SI-NEXT: v_readlane_b32 s13, v23, 19 ; SI-NEXT: s_lshl_b32 s12, s12, 8 ; SI-NEXT: s_or_b32 s10, s10, s12 -; SI-NEXT: v_readlane_b32 s12, v22, 20 -; SI-NEXT: v_readlane_b32 s13, v22, 21 +; SI-NEXT: v_readlane_b32 s12, v23, 20 +; SI-NEXT: v_readlane_b32 s13, v23, 21 ; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: v_readlane_b32 s14, v22, 22 +; SI-NEXT: v_readlane_b32 s14, v23, 22 ; SI-NEXT: s_lshl_b32 s12, s12, 16 ; SI-NEXT: s_lshl_b32 s13, s14, 24 ; SI-NEXT: s_and_b32 s10, s10, 0xffff @@ -8273,12 +8300,12 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: s_and_b32 s10, s11, 0xff -; SI-NEXT: v_readlane_b32 s11, v22, 52 +; SI-NEXT: v_readlane_b32 s11, v23, 52 ; SI-NEXT: s_lshl_b32 s11, s11, 8 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_readlane_b32 s11, v22, 51 +; SI-NEXT: v_readlane_b32 s11, v23, 51 ; SI-NEXT: s_and_b32 s11, s11, 0xff -; SI-NEXT: v_readlane_b32 s12, v22, 50 +; SI-NEXT: v_readlane_b32 s12, v23, 50 ; SI-NEXT: s_lshl_b32 s11, s11, 16 ; SI-NEXT: s_lshl_b32 s12, s12, 24 ; SI-NEXT: s_and_b32 s10, s10, 0xffff @@ -8288,15 +8315,15 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: v_readlane_b32 s10, v22, 24 +; SI-NEXT: v_readlane_b32 s10, v23, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: v_readlane_b32 s11, v22, 25 +; SI-NEXT: v_readlane_b32 s11, v23, 25 ; SI-NEXT: s_lshl_b32 s10, s10, 8 ; SI-NEXT: s_or_b32 s8, s8, s10 -; SI-NEXT: v_readlane_b32 s10, v22, 26 -; SI-NEXT: v_readlane_b32 s11, v22, 27 +; SI-NEXT: v_readlane_b32 s10, v23, 26 +; SI-NEXT: v_readlane_b32 s11, v23, 27 ; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: v_readlane_b32 s12, v22, 28 +; SI-NEXT: v_readlane_b32 s12, v23, 28 ; SI-NEXT: s_lshl_b32 s10, s10, 16 ; SI-NEXT: s_lshl_b32 s11, s12, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xffff @@ -8307,10 +8334,10 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: s_and_b32 s8, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v22, 49 +; SI-NEXT: v_readlane_b32 s9, v23, 49 ; SI-NEXT: s_lshl_b32 s9, s9, 8 ; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: v_readlane_b32 s9, v22, 48 +; SI-NEXT: v_readlane_b32 s9, v23, 48 ; SI-NEXT: s_and_b32 s9, s9, 0xff ; SI-NEXT: s_lshl_b32 s9, s9, 16 ; SI-NEXT: s_lshl_b32 s10, s50, 24 @@ -8321,15 +8348,15 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: v_readlane_b32 s8, v22, 30 +; SI-NEXT: v_readlane_b32 s8, v23, 30 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: v_readlane_b32 s9, v22, 31 +; SI-NEXT: v_readlane_b32 s9, v23, 31 ; SI-NEXT: s_lshl_b32 s8, s8, 8 ; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: v_readlane_b32 s8, v22, 32 -; SI-NEXT: v_readlane_b32 s9, v22, 33 +; SI-NEXT: v_readlane_b32 s8, v23, 32 +; SI-NEXT: v_readlane_b32 s9, v23, 33 ; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: v_readlane_b32 s10, v22, 34 +; SI-NEXT: v_readlane_b32 s10, v23, 34 ; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_lshl_b32 s9, s10, 24 ; SI-NEXT: s_and_b32 s6, s6, 0xffff @@ -8340,12 +8367,12 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_and_b32 s6, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v22, 47 +; SI-NEXT: v_readlane_b32 s7, v23, 47 ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: v_readlane_b32 s7, v22, 46 +; SI-NEXT: v_readlane_b32 s7, v23, 46 ; SI-NEXT: s_and_b32 s7, s7, 0xff -; SI-NEXT: v_readlane_b32 s8, v22, 45 +; SI-NEXT: v_readlane_b32 s8, v23, 45 ; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_lshl_b32 s8, s8, 24 ; SI-NEXT: s_and_b32 s6, s6, 0xffff @@ -8355,15 +8382,15 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_readlane_b32 s6, v22, 36 +; SI-NEXT: v_readlane_b32 s6, v23, 36 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: v_readlane_b32 s7, v22, 37 +; SI-NEXT: v_readlane_b32 s7, v23, 37 ; SI-NEXT: s_lshl_b32 s6, s6, 8 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: v_readlane_b32 s6, v22, 38 -; SI-NEXT: v_readlane_b32 s7, v22, 39 +; SI-NEXT: v_readlane_b32 s6, v23, 38 +; SI-NEXT: v_readlane_b32 s7, v23, 39 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: v_readlane_b32 s8, v22, 40 +; SI-NEXT: v_readlane_b32 s8, v23, 40 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_lshl_b32 s7, s8, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff @@ -8374,12 +8401,12 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s4 ; SI-NEXT: s_and_b32 s4, s5, 0xff -; SI-NEXT: v_readlane_b32 s5, v22, 44 +; SI-NEXT: v_readlane_b32 s5, v23, 44 ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s5, v22, 43 +; SI-NEXT: v_readlane_b32 s5, v23, 43 ; SI-NEXT: s_and_b32 s5, s5, 0xff -; SI-NEXT: v_readlane_b32 s6, v22, 42 +; SI-NEXT: v_readlane_b32 s6, v23, 42 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s6, s6, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff @@ -8389,206 +8416,207 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: v_readlane_b32 s19, v22, 11 -; SI-NEXT: v_readlane_b32 s17, v22, 17 -; SI-NEXT: v_readlane_b32 s15, v22, 23 -; SI-NEXT: v_readlane_b32 s13, v22, 29 -; SI-NEXT: v_readlane_b32 s11, v22, 35 -; SI-NEXT: v_readlane_b32 s9, v22, 41 +; SI-NEXT: v_readlane_b32 s21, v23, 5 +; SI-NEXT: v_readlane_b32 s19, v23, 11 +; SI-NEXT: v_readlane_b32 s17, v23, 17 +; SI-NEXT: v_readlane_b32 s15, v23, 23 +; SI-NEXT: v_readlane_b32 s13, v23, 29 +; SI-NEXT: v_readlane_b32 s11, v23, 35 +; SI-NEXT: v_readlane_b32 s9, v23, 41 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: v_readlane_b32 s99, v20, 35 -; SI-NEXT: v_readlane_b32 s98, v20, 34 -; SI-NEXT: v_readlane_b32 s97, v20, 33 -; SI-NEXT: v_readlane_b32 s96, v20, 32 -; SI-NEXT: v_readlane_b32 s87, v20, 31 -; SI-NEXT: v_readlane_b32 s86, v20, 30 -; SI-NEXT: v_readlane_b32 s85, v20, 29 -; SI-NEXT: v_readlane_b32 s84, v20, 28 -; SI-NEXT: v_readlane_b32 s83, v20, 27 -; SI-NEXT: v_readlane_b32 s82, v20, 26 -; SI-NEXT: v_readlane_b32 s81, v20, 25 -; SI-NEXT: v_readlane_b32 s80, v20, 24 -; SI-NEXT: v_readlane_b32 s71, v20, 23 -; SI-NEXT: v_readlane_b32 s70, v20, 22 -; SI-NEXT: v_readlane_b32 s69, v20, 21 -; SI-NEXT: v_readlane_b32 s68, v20, 20 -; SI-NEXT: v_readlane_b32 s67, v20, 19 -; SI-NEXT: v_readlane_b32 s66, v20, 18 -; SI-NEXT: v_readlane_b32 s65, v20, 17 -; SI-NEXT: v_readlane_b32 s64, v20, 16 -; SI-NEXT: v_readlane_b32 s55, v20, 15 -; SI-NEXT: v_readlane_b32 s54, v20, 14 -; SI-NEXT: v_readlane_b32 s53, v20, 13 -; SI-NEXT: v_readlane_b32 s52, v20, 12 -; SI-NEXT: v_readlane_b32 s51, v20, 11 -; SI-NEXT: v_readlane_b32 s50, v20, 10 -; SI-NEXT: v_readlane_b32 s49, v20, 9 -; SI-NEXT: v_readlane_b32 s48, v20, 8 -; SI-NEXT: v_readlane_b32 s39, v20, 7 -; SI-NEXT: v_readlane_b32 s38, v20, 6 -; SI-NEXT: v_readlane_b32 s37, v20, 5 -; SI-NEXT: v_readlane_b32 s36, v20, 4 -; SI-NEXT: v_readlane_b32 s35, v20, 3 -; SI-NEXT: v_readlane_b32 s34, v20, 2 -; SI-NEXT: v_readlane_b32 s31, v20, 1 -; SI-NEXT: v_readlane_b32 s30, v20, 0 +; SI-NEXT: v_readlane_b32 s99, v21, 35 +; SI-NEXT: v_readlane_b32 s98, v21, 34 +; SI-NEXT: v_readlane_b32 s97, v21, 33 +; SI-NEXT: v_readlane_b32 s96, v21, 32 +; SI-NEXT: v_readlane_b32 s87, v21, 31 +; SI-NEXT: v_readlane_b32 s86, v21, 30 +; SI-NEXT: v_readlane_b32 s85, v21, 29 +; SI-NEXT: v_readlane_b32 s84, v21, 28 +; SI-NEXT: v_readlane_b32 s83, v21, 27 +; SI-NEXT: v_readlane_b32 s82, v21, 26 +; SI-NEXT: v_readlane_b32 s81, v21, 25 +; SI-NEXT: v_readlane_b32 s80, v21, 24 +; SI-NEXT: v_readlane_b32 s71, v21, 23 +; SI-NEXT: v_readlane_b32 s70, v21, 22 +; SI-NEXT: v_readlane_b32 s69, v21, 21 +; SI-NEXT: v_readlane_b32 s68, v21, 20 +; SI-NEXT: v_readlane_b32 s67, v21, 19 +; SI-NEXT: v_readlane_b32 s66, v21, 18 +; SI-NEXT: v_readlane_b32 s65, v21, 17 +; SI-NEXT: v_readlane_b32 s64, v21, 16 +; SI-NEXT: v_readlane_b32 s55, v21, 15 +; SI-NEXT: v_readlane_b32 s54, v21, 14 +; SI-NEXT: v_readlane_b32 s53, v21, 13 +; SI-NEXT: v_readlane_b32 s52, v21, 12 +; SI-NEXT: v_readlane_b32 s51, v21, 11 +; SI-NEXT: v_readlane_b32 s50, v21, 10 +; SI-NEXT: v_readlane_b32 s49, v21, 9 +; SI-NEXT: v_readlane_b32 s48, v21, 8 +; SI-NEXT: v_readlane_b32 s39, v21, 7 +; SI-NEXT: v_readlane_b32 s38, v21, 6 +; SI-NEXT: v_readlane_b32 s37, v21, 5 +; SI-NEXT: v_readlane_b32 s36, v21, 4 +; SI-NEXT: v_readlane_b32 s35, v21, 3 +; SI-NEXT: v_readlane_b32 s34, v21, 2 +; SI-NEXT: v_readlane_b32 s31, v21, 1 +; SI-NEXT: v_readlane_b32 s30, v21, 0 ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB13_4: ; SI-NEXT: ; implicit-def: $sgpr51 ; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v22, s50, 0 -; SI-NEXT: v_writelane_b32 v22, s51, 1 +; SI-NEXT: v_writelane_b32 v23, s50, 0 +; SI-NEXT: v_writelane_b32 v23, s51, 1 ; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v22, s50, 2 -; SI-NEXT: v_writelane_b32 v22, s51, 3 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s50, 2 +; SI-NEXT: v_writelane_b32 v23, s51, 3 ; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v22, s50, 4 -; SI-NEXT: v_writelane_b32 v22, s51, 5 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s50, 4 +; SI-NEXT: v_writelane_b32 v23, s51, 5 ; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v22, s50, 6 -; SI-NEXT: v_writelane_b32 v22, s51, 7 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s50, 6 +; SI-NEXT: v_writelane_b32 v23, s51, 7 ; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v22, s50, 8 -; SI-NEXT: v_writelane_b32 v22, s51, 9 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s50, 8 +; SI-NEXT: v_writelane_b32 v23, s51, 9 ; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v22, s50, 10 -; SI-NEXT: v_writelane_b32 v22, s51, 11 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s50, 10 +; SI-NEXT: v_writelane_b32 v23, s51, 11 ; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v22, s50, 12 -; SI-NEXT: v_writelane_b32 v22, s51, 13 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s50, 12 +; SI-NEXT: v_writelane_b32 v23, s51, 13 ; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v22, s50, 14 -; SI-NEXT: v_writelane_b32 v22, s51, 15 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s50, 14 +; SI-NEXT: v_writelane_b32 v23, s51, 15 ; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v22, s50, 16 -; SI-NEXT: v_writelane_b32 v22, s51, 17 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s50, 16 +; SI-NEXT: v_writelane_b32 v23, s51, 17 ; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v22, s50, 18 -; SI-NEXT: v_writelane_b32 v22, s51, 19 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s50, 18 +; SI-NEXT: v_writelane_b32 v23, s51, 19 ; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v22, s50, 20 -; SI-NEXT: v_writelane_b32 v22, s51, 21 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s50, 20 +; SI-NEXT: v_writelane_b32 v23, s51, 21 ; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v22, s50, 22 -; SI-NEXT: v_writelane_b32 v22, s51, 23 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s50, 22 +; SI-NEXT: v_writelane_b32 v23, s51, 23 ; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v22, s50, 24 -; SI-NEXT: v_writelane_b32 v22, s51, 25 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s50, 24 +; SI-NEXT: v_writelane_b32 v23, s51, 25 ; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v22, s50, 26 -; SI-NEXT: v_writelane_b32 v22, s51, 27 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s50, 26 +; SI-NEXT: v_writelane_b32 v23, s51, 27 ; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v22, s50, 28 -; SI-NEXT: v_writelane_b32 v22, s51, 29 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s50, 28 +; SI-NEXT: v_writelane_b32 v23, s51, 29 ; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v22, s50, 30 -; SI-NEXT: v_writelane_b32 v22, s51, 31 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s50, 30 +; SI-NEXT: v_writelane_b32 v23, s51, 31 ; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v22, s50, 32 -; SI-NEXT: v_writelane_b32 v22, s51, 33 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s50, 32 +; SI-NEXT: v_writelane_b32 v23, s51, 33 ; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v22, s50, 34 -; SI-NEXT: v_writelane_b32 v22, s51, 35 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s50, 34 +; SI-NEXT: v_writelane_b32 v23, s51, 35 ; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v22, s50, 36 -; SI-NEXT: v_writelane_b32 v22, s51, 37 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s50, 36 +; SI-NEXT: v_writelane_b32 v23, s51, 37 ; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v22, s50, 38 -; SI-NEXT: v_writelane_b32 v22, s51, 39 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s50, 38 +; SI-NEXT: v_writelane_b32 v23, s51, 39 ; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr49 ; SI-NEXT: ; implicit-def: $sgpr55 ; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; implicit-def: $sgpr53 ; SI-NEXT: ; implicit-def: $sgpr52 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr38 ; SI-NEXT: ; implicit-def: $sgpr36 ; SI-NEXT: ; implicit-def: $sgpr34 @@ -8604,9 +8632,9 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr62 ; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v22, s50, 40 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s50, 40 ; SI-NEXT: ; implicit-def: $sgpr98 ; SI-NEXT: ; implicit-def: $sgpr96 ; SI-NEXT: ; implicit-def: $sgpr86 @@ -8617,7 +8645,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr68 ; SI-NEXT: ; implicit-def: $sgpr66 ; SI-NEXT: ; implicit-def: $sgpr64 -; SI-NEXT: v_writelane_b32 v22, s51, 41 +; SI-NEXT: v_writelane_b32 v23, s51, 41 ; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: s_branch .LBB13_2 ; @@ -8625,47 +8653,75 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v20, s30, 0 -; VI-NEXT: v_writelane_b32 v20, s31, 1 -; VI-NEXT: v_writelane_b32 v20, s34, 2 -; VI-NEXT: v_writelane_b32 v20, s35, 3 -; VI-NEXT: v_writelane_b32 v20, s36, 4 -; VI-NEXT: v_writelane_b32 v20, s37, 5 -; VI-NEXT: v_writelane_b32 v20, s38, 6 -; VI-NEXT: v_writelane_b32 v20, s39, 7 -; VI-NEXT: v_writelane_b32 v20, s48, 8 -; VI-NEXT: v_writelane_b32 v20, s49, 9 -; VI-NEXT: v_writelane_b32 v20, s50, 10 -; VI-NEXT: v_writelane_b32 v20, s51, 11 -; VI-NEXT: v_writelane_b32 v20, s52, 12 -; VI-NEXT: v_writelane_b32 v20, s53, 13 -; VI-NEXT: v_writelane_b32 v20, s54, 14 -; VI-NEXT: v_writelane_b32 v20, s55, 15 -; VI-NEXT: v_writelane_b32 v20, s64, 16 -; VI-NEXT: v_writelane_b32 v20, s65, 17 -; VI-NEXT: v_writelane_b32 v20, s66, 18 -; VI-NEXT: v_writelane_b32 v20, s67, 19 -; VI-NEXT: v_writelane_b32 v20, s68, 20 -; VI-NEXT: v_writelane_b32 v20, s69, 21 -; VI-NEXT: v_writelane_b32 v20, s70, 22 -; VI-NEXT: v_writelane_b32 v20, s71, 23 -; VI-NEXT: v_writelane_b32 v20, s80, 24 -; VI-NEXT: v_writelane_b32 v20, s81, 25 -; VI-NEXT: v_writelane_b32 v20, s82, 26 -; VI-NEXT: v_writelane_b32 v20, s83, 27 -; VI-NEXT: v_writelane_b32 v20, s84, 28 -; VI-NEXT: v_writelane_b32 v20, s85, 29 +; VI-NEXT: v_writelane_b32 v21, s30, 0 +; VI-NEXT: v_writelane_b32 v21, s31, 1 +; VI-NEXT: v_writelane_b32 v21, s34, 2 +; VI-NEXT: v_writelane_b32 v21, s35, 3 +; VI-NEXT: v_writelane_b32 v21, s36, 4 +; VI-NEXT: v_writelane_b32 v21, s37, 5 +; VI-NEXT: v_writelane_b32 v21, s38, 6 +; VI-NEXT: v_writelane_b32 v21, s39, 7 +; VI-NEXT: v_writelane_b32 v21, s48, 8 +; VI-NEXT: v_writelane_b32 v21, s49, 9 +; VI-NEXT: v_writelane_b32 v21, s50, 10 +; VI-NEXT: v_writelane_b32 v21, s51, 11 +; VI-NEXT: v_writelane_b32 v21, s52, 12 +; VI-NEXT: v_writelane_b32 v21, s53, 13 +; VI-NEXT: v_writelane_b32 v21, s54, 14 +; VI-NEXT: v_writelane_b32 v21, s55, 15 +; VI-NEXT: v_writelane_b32 v21, s64, 16 +; VI-NEXT: v_mov_b32_e32 v20, s16 +; VI-NEXT: v_writelane_b32 v21, s65, 17 +; VI-NEXT: v_readfirstlane_b32 s56, v20 +; VI-NEXT: v_mov_b32_e32 v20, s17 +; VI-NEXT: v_writelane_b32 v21, s66, 18 +; VI-NEXT: v_readfirstlane_b32 s57, v20 +; VI-NEXT: v_mov_b32_e32 v20, s18 +; VI-NEXT: v_writelane_b32 v21, s67, 19 +; VI-NEXT: v_readfirstlane_b32 s46, v20 +; VI-NEXT: v_mov_b32_e32 v20, s19 +; VI-NEXT: v_writelane_b32 v21, s68, 20 +; VI-NEXT: v_readfirstlane_b32 s47, v20 +; VI-NEXT: v_mov_b32_e32 v20, s20 +; VI-NEXT: v_writelane_b32 v21, s69, 21 +; VI-NEXT: v_readfirstlane_b32 s44, v20 +; VI-NEXT: v_mov_b32_e32 v20, s21 +; VI-NEXT: v_writelane_b32 v21, s70, 22 +; VI-NEXT: v_readfirstlane_b32 s45, v20 +; VI-NEXT: v_mov_b32_e32 v20, s22 +; VI-NEXT: v_writelane_b32 v21, s71, 23 +; VI-NEXT: v_readfirstlane_b32 s42, v20 +; VI-NEXT: v_mov_b32_e32 v20, s23 +; VI-NEXT: v_writelane_b32 v21, s80, 24 +; VI-NEXT: v_readfirstlane_b32 s43, v20 +; VI-NEXT: v_mov_b32_e32 v20, s24 +; VI-NEXT: v_writelane_b32 v21, s81, 25 +; VI-NEXT: v_readfirstlane_b32 s40, v20 +; VI-NEXT: v_mov_b32_e32 v20, s25 +; VI-NEXT: v_writelane_b32 v21, s82, 26 +; VI-NEXT: v_readfirstlane_b32 s41, v20 +; VI-NEXT: v_mov_b32_e32 v20, s26 +; VI-NEXT: v_writelane_b32 v21, s83, 27 +; VI-NEXT: v_readfirstlane_b32 s24, v20 +; VI-NEXT: v_mov_b32_e32 v20, s27 +; VI-NEXT: v_writelane_b32 v21, s84, 28 +; VI-NEXT: v_readfirstlane_b32 s25, v20 +; VI-NEXT: v_mov_b32_e32 v20, s28 +; VI-NEXT: v_writelane_b32 v21, s85, 29 +; VI-NEXT: v_readfirstlane_b32 s22, v20 +; VI-NEXT: v_mov_b32_e32 v20, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; VI-NEXT: v_writelane_b32 v20, s86, 30 -; VI-NEXT: v_readfirstlane_b32 s44, v1 -; VI-NEXT: v_readfirstlane_b32 s45, v2 -; VI-NEXT: v_readfirstlane_b32 s42, v3 -; VI-NEXT: v_readfirstlane_b32 s43, v4 -; VI-NEXT: v_readfirstlane_b32 s40, v5 -; VI-NEXT: v_readfirstlane_b32 s41, v6 +; VI-NEXT: v_writelane_b32 v21, s86, 30 +; VI-NEXT: v_readfirstlane_b32 s23, v20 +; VI-NEXT: v_readfirstlane_b32 s20, v1 +; VI-NEXT: v_readfirstlane_b32 s21, v2 +; VI-NEXT: v_readfirstlane_b32 s18, v3 +; VI-NEXT: v_readfirstlane_b32 s19, v4 +; VI-NEXT: v_readfirstlane_b32 s16, v5 +; VI-NEXT: v_readfirstlane_b32 s17, v6 ; VI-NEXT: v_readfirstlane_b32 s14, v7 ; VI-NEXT: v_readfirstlane_b32 s15, v8 ; VI-NEXT: v_readfirstlane_b32 s12, v9 @@ -8677,609 +8733,609 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; VI-NEXT: v_readfirstlane_b32 s6, v15 ; VI-NEXT: v_readfirstlane_b32 s7, v16 ; VI-NEXT: v_readfirstlane_b32 s4, v17 -; VI-NEXT: s_and_b64 s[46:47], vcc, exec +; VI-NEXT: s_and_b64 s[26:27], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s5, v18 -; VI-NEXT: v_writelane_b32 v20, s87, 31 -; VI-NEXT: ; implicit-def: $vgpr21 : SGPR spill to VGPR lane +; VI-NEXT: v_writelane_b32 v21, s87, 31 +; VI-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane ; VI-NEXT: s_cbranch_scc0 .LBB13_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s46, s5, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 8 -; VI-NEXT: s_lshr_b32 s46, s5, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 9 -; VI-NEXT: s_lshr_b32 s46, s5, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 10 -; VI-NEXT: s_lshr_b32 s46, s4, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 11 -; VI-NEXT: s_lshr_b32 s46, s4, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 12 -; VI-NEXT: s_lshr_b32 s46, s7, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 13 -; VI-NEXT: s_lshr_b32 s46, s7, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 14 -; VI-NEXT: s_lshr_b32 s46, s7, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 15 -; VI-NEXT: s_lshr_b32 s46, s6, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 16 -; VI-NEXT: s_lshr_b32 s46, s6, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 17 -; VI-NEXT: s_lshr_b32 s46, s9, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 18 -; VI-NEXT: s_lshr_b32 s46, s9, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 19 -; VI-NEXT: s_lshr_b32 s46, s9, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 20 -; VI-NEXT: s_lshr_b32 s46, s8, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 21 -; VI-NEXT: s_lshr_b32 s46, s8, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 22 -; VI-NEXT: s_lshr_b32 s46, s11, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 23 -; VI-NEXT: s_lshr_b32 s46, s11, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 24 -; VI-NEXT: s_lshr_b32 s46, s11, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 25 -; VI-NEXT: s_lshr_b32 s46, s10, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 26 -; VI-NEXT: s_lshr_b32 s46, s10, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 27 -; VI-NEXT: s_lshr_b32 s46, s13, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 28 -; VI-NEXT: s_lshr_b32 s46, s13, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 29 -; VI-NEXT: s_lshr_b32 s46, s13, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 30 -; VI-NEXT: s_lshr_b32 s46, s12, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 31 -; VI-NEXT: s_lshr_b32 s46, s12, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 32 -; VI-NEXT: s_lshr_b32 s46, s15, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 33 -; VI-NEXT: s_lshr_b32 s46, s15, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 34 -; VI-NEXT: s_lshr_b32 s46, s15, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 35 -; VI-NEXT: s_lshr_b32 s46, s14, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 36 -; VI-NEXT: s_lshr_b32 s46, s14, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 37 -; VI-NEXT: s_lshr_b32 s46, s41, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 38 -; VI-NEXT: s_lshr_b32 s46, s41, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 39 -; VI-NEXT: s_lshr_b32 s46, s41, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 40 -; VI-NEXT: s_lshr_b32 s46, s40, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 41 -; VI-NEXT: s_lshr_b32 s46, s40, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 42 -; VI-NEXT: s_lshr_b32 s46, s43, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 43 -; VI-NEXT: s_lshr_b32 s46, s43, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 44 -; VI-NEXT: s_lshr_b32 s46, s43, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 45 -; VI-NEXT: s_lshr_b32 s46, s42, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 46 -; VI-NEXT: s_lshr_b32 s46, s42, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 47 -; VI-NEXT: s_lshr_b32 s46, s45, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 48 -; VI-NEXT: s_lshr_b32 s46, s45, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 49 -; VI-NEXT: s_lshr_b32 s46, s45, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 50 -; VI-NEXT: s_lshr_b32 s46, s44, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 51 -; VI-NEXT: s_lshr_b32 s46, s44, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 52 -; VI-NEXT: s_lshr_b32 s46, s29, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 53 -; VI-NEXT: s_lshr_b32 s46, s29, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 54 -; VI-NEXT: s_lshr_b32 s46, s29, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 55 -; VI-NEXT: s_lshr_b32 s46, s28, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 56 -; VI-NEXT: s_lshr_b32 s46, s28, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 57 -; VI-NEXT: s_lshr_b32 s46, s27, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 58 -; VI-NEXT: s_lshr_b32 s46, s27, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 59 +; VI-NEXT: s_lshr_b32 s26, s5, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 8 +; VI-NEXT: s_lshr_b32 s26, s5, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 9 +; VI-NEXT: s_lshr_b32 s26, s5, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 10 +; VI-NEXT: s_lshr_b32 s26, s4, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 11 +; VI-NEXT: s_lshr_b32 s26, s4, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 12 +; VI-NEXT: s_lshr_b32 s26, s7, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 13 +; VI-NEXT: s_lshr_b32 s26, s7, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 14 +; VI-NEXT: s_lshr_b32 s26, s7, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 15 +; VI-NEXT: s_lshr_b32 s26, s6, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 16 +; VI-NEXT: s_lshr_b32 s26, s6, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 17 +; VI-NEXT: s_lshr_b32 s26, s9, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 18 +; VI-NEXT: s_lshr_b32 s26, s9, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 19 +; VI-NEXT: s_lshr_b32 s26, s9, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 20 +; VI-NEXT: s_lshr_b32 s26, s8, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 21 +; VI-NEXT: s_lshr_b32 s26, s8, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 22 +; VI-NEXT: s_lshr_b32 s26, s11, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 23 +; VI-NEXT: s_lshr_b32 s26, s11, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 24 +; VI-NEXT: s_lshr_b32 s26, s11, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 25 +; VI-NEXT: s_lshr_b32 s26, s10, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 26 +; VI-NEXT: s_lshr_b32 s26, s10, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 27 +; VI-NEXT: s_lshr_b32 s26, s13, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 28 +; VI-NEXT: s_lshr_b32 s26, s13, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 29 +; VI-NEXT: s_lshr_b32 s26, s13, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 30 +; VI-NEXT: s_lshr_b32 s26, s12, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 31 +; VI-NEXT: s_lshr_b32 s26, s12, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 32 +; VI-NEXT: s_lshr_b32 s26, s15, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 33 +; VI-NEXT: s_lshr_b32 s26, s15, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 34 +; VI-NEXT: s_lshr_b32 s26, s15, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 35 +; VI-NEXT: s_lshr_b32 s26, s14, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 36 +; VI-NEXT: s_lshr_b32 s26, s14, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 37 +; VI-NEXT: s_lshr_b32 s26, s17, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 38 +; VI-NEXT: s_lshr_b32 s26, s17, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 39 +; VI-NEXT: s_lshr_b32 s26, s17, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 40 +; VI-NEXT: s_lshr_b32 s26, s16, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 41 +; VI-NEXT: s_lshr_b32 s26, s16, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 42 +; VI-NEXT: s_lshr_b32 s26, s19, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 43 +; VI-NEXT: s_lshr_b32 s26, s19, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 44 +; VI-NEXT: s_lshr_b32 s26, s19, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 45 +; VI-NEXT: s_lshr_b32 s26, s18, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 46 +; VI-NEXT: s_lshr_b32 s26, s18, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 47 +; VI-NEXT: s_lshr_b32 s26, s21, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 48 +; VI-NEXT: s_lshr_b32 s26, s21, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 49 +; VI-NEXT: s_lshr_b32 s26, s21, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 50 +; VI-NEXT: s_lshr_b32 s26, s20, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 51 +; VI-NEXT: s_lshr_b32 s26, s20, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 52 +; VI-NEXT: s_lshr_b32 s26, s23, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 53 +; VI-NEXT: s_lshr_b32 s26, s23, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 54 +; VI-NEXT: s_lshr_b32 s26, s23, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 55 +; VI-NEXT: s_lshr_b32 s26, s22, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 56 +; VI-NEXT: s_lshr_b32 s26, s22, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 57 +; VI-NEXT: s_lshr_b32 s26, s25, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 58 +; VI-NEXT: s_lshr_b32 s26, s25, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 59 ; VI-NEXT: s_lshr_b64 s[60:61], s[4:5], 24 -; VI-NEXT: v_writelane_b32 v21, s60, 6 -; VI-NEXT: v_writelane_b32 v21, s61, 7 +; VI-NEXT: v_writelane_b32 v22, s60, 6 +; VI-NEXT: v_writelane_b32 v22, s61, 7 ; VI-NEXT: s_lshr_b64 s[60:61], s[6:7], 24 -; VI-NEXT: v_writelane_b32 v21, s60, 4 -; VI-NEXT: v_writelane_b32 v21, s61, 5 +; VI-NEXT: v_writelane_b32 v22, s60, 4 +; VI-NEXT: v_writelane_b32 v22, s61, 5 ; VI-NEXT: s_lshr_b64 s[60:61], s[8:9], 24 -; VI-NEXT: v_writelane_b32 v21, s60, 2 -; VI-NEXT: v_writelane_b32 v21, s61, 3 +; VI-NEXT: v_writelane_b32 v22, s60, 2 +; VI-NEXT: v_writelane_b32 v22, s61, 3 ; VI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 -; VI-NEXT: v_writelane_b32 v21, s60, 0 -; VI-NEXT: s_lshr_b32 s66, s27, 8 -; VI-NEXT: s_lshr_b32 s67, s26, 16 -; VI-NEXT: s_lshr_b32 s68, s26, 8 -; VI-NEXT: s_lshr_b32 s69, s25, 24 -; VI-NEXT: s_lshr_b32 s70, s25, 16 -; VI-NEXT: s_lshr_b32 s71, s25, 8 -; VI-NEXT: s_lshr_b32 s80, s24, 16 -; VI-NEXT: s_lshr_b32 s81, s24, 8 -; VI-NEXT: s_lshr_b32 s82, s23, 24 -; VI-NEXT: s_lshr_b32 s83, s23, 16 -; VI-NEXT: s_lshr_b32 s84, s23, 8 -; VI-NEXT: s_lshr_b32 s85, s22, 16 -; VI-NEXT: s_lshr_b32 s86, s22, 8 -; VI-NEXT: s_lshr_b32 s87, s21, 24 -; VI-NEXT: s_lshr_b32 s50, s21, 16 -; VI-NEXT: s_lshr_b32 s46, s21, 8 -; VI-NEXT: s_lshr_b32 s47, s20, 16 -; VI-NEXT: s_lshr_b32 s56, s20, 8 -; VI-NEXT: s_lshr_b32 s57, s19, 24 -; VI-NEXT: s_lshr_b32 s51, s19, 16 -; VI-NEXT: s_lshr_b32 s52, s19, 8 -; VI-NEXT: s_lshr_b32 s53, s18, 16 -; VI-NEXT: s_lshr_b32 s54, s18, 8 -; VI-NEXT: s_lshr_b32 s58, s17, 24 -; VI-NEXT: s_lshr_b32 s59, s17, 16 -; VI-NEXT: s_lshr_b32 s55, s17, 8 -; VI-NEXT: s_lshr_b32 s64, s16, 16 -; VI-NEXT: s_lshr_b32 s65, s16, 8 -; VI-NEXT: v_writelane_b32 v21, s61, 1 +; VI-NEXT: v_writelane_b32 v22, s60, 0 +; VI-NEXT: s_lshr_b32 s66, s25, 8 +; VI-NEXT: s_lshr_b32 s67, s24, 16 +; VI-NEXT: s_lshr_b32 s68, s24, 8 +; VI-NEXT: s_lshr_b32 s69, s41, 24 +; VI-NEXT: s_lshr_b32 s70, s41, 16 +; VI-NEXT: s_lshr_b32 s71, s41, 8 +; VI-NEXT: s_lshr_b32 s80, s40, 16 +; VI-NEXT: s_lshr_b32 s81, s40, 8 +; VI-NEXT: s_lshr_b32 s82, s43, 24 +; VI-NEXT: s_lshr_b32 s83, s43, 16 +; VI-NEXT: s_lshr_b32 s84, s43, 8 +; VI-NEXT: s_lshr_b32 s85, s42, 16 +; VI-NEXT: s_lshr_b32 s86, s42, 8 +; VI-NEXT: s_lshr_b32 s87, s45, 24 +; VI-NEXT: s_lshr_b32 s50, s45, 16 +; VI-NEXT: s_lshr_b32 s26, s45, 8 +; VI-NEXT: s_lshr_b32 s27, s44, 16 +; VI-NEXT: s_lshr_b32 s28, s44, 8 +; VI-NEXT: s_lshr_b32 s29, s47, 24 +; VI-NEXT: s_lshr_b32 s51, s47, 16 +; VI-NEXT: s_lshr_b32 s52, s47, 8 +; VI-NEXT: s_lshr_b32 s53, s46, 16 +; VI-NEXT: s_lshr_b32 s54, s46, 8 +; VI-NEXT: s_lshr_b32 s58, s57, 24 +; VI-NEXT: s_lshr_b32 s59, s57, 16 +; VI-NEXT: s_lshr_b32 s55, s57, 8 +; VI-NEXT: s_lshr_b32 s64, s56, 16 +; VI-NEXT: s_lshr_b32 s65, s56, 8 +; VI-NEXT: v_writelane_b32 v22, s61, 1 ; VI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 ; VI-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 -; VI-NEXT: s_lshr_b64 s[74:75], s[40:41], 24 -; VI-NEXT: s_lshr_b64 s[76:77], s[42:43], 24 -; VI-NEXT: s_lshr_b64 s[78:79], s[44:45], 24 -; VI-NEXT: s_lshr_b64 s[88:89], s[28:29], 24 -; VI-NEXT: s_lshr_b64 s[90:91], s[26:27], 24 -; VI-NEXT: s_lshr_b64 s[30:31], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[34:35], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[36:37], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[38:39], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[48:49], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[74:75], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[76:77], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[88:89], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[90:91], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[30:31], s[40:41], 24 +; VI-NEXT: s_lshr_b64 s[34:35], s[42:43], 24 +; VI-NEXT: s_lshr_b64 s[36:37], s[44:45], 24 +; VI-NEXT: s_lshr_b64 s[38:39], s[46:47], 24 +; VI-NEXT: s_lshr_b64 s[48:49], s[56:57], 24 ; VI-NEXT: s_cbranch_execnz .LBB13_3 ; VI-NEXT: .LBB13_2: ; %cmp.true ; VI-NEXT: s_add_i32 s5, s5, 3 -; VI-NEXT: s_lshr_b32 s46, s5, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 8 -; VI-NEXT: s_lshr_b32 s46, s5, 16 +; VI-NEXT: s_lshr_b32 s26, s5, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 8 +; VI-NEXT: s_lshr_b32 s26, s5, 16 ; VI-NEXT: s_add_i32 s4, s4, 3 -; VI-NEXT: v_writelane_b32 v21, s46, 9 -; VI-NEXT: s_lshr_b32 s46, s5, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 10 -; VI-NEXT: s_lshr_b32 s46, s4, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 9 +; VI-NEXT: s_lshr_b32 s26, s5, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 10 +; VI-NEXT: s_lshr_b32 s26, s4, 16 ; VI-NEXT: s_add_i32 s7, s7, 3 -; VI-NEXT: v_writelane_b32 v21, s46, 11 -; VI-NEXT: s_lshr_b32 s46, s4, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 12 -; VI-NEXT: s_lshr_b32 s46, s7, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 13 -; VI-NEXT: s_lshr_b32 s46, s7, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 11 +; VI-NEXT: s_lshr_b32 s26, s4, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 12 +; VI-NEXT: s_lshr_b32 s26, s7, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 13 +; VI-NEXT: s_lshr_b32 s26, s7, 16 ; VI-NEXT: s_add_i32 s6, s6, 3 -; VI-NEXT: v_writelane_b32 v21, s46, 14 -; VI-NEXT: s_lshr_b32 s46, s7, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 15 -; VI-NEXT: s_lshr_b32 s46, s6, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 14 +; VI-NEXT: s_lshr_b32 s26, s7, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 15 +; VI-NEXT: s_lshr_b32 s26, s6, 16 ; VI-NEXT: s_add_i32 s9, s9, 3 -; VI-NEXT: v_writelane_b32 v21, s46, 16 -; VI-NEXT: s_lshr_b32 s46, s6, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 17 -; VI-NEXT: s_lshr_b32 s46, s9, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 18 -; VI-NEXT: s_lshr_b32 s46, s9, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 16 +; VI-NEXT: s_lshr_b32 s26, s6, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 17 +; VI-NEXT: s_lshr_b32 s26, s9, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 18 +; VI-NEXT: s_lshr_b32 s26, s9, 16 ; VI-NEXT: s_add_i32 s8, s8, 3 -; VI-NEXT: v_writelane_b32 v21, s46, 19 -; VI-NEXT: s_lshr_b32 s46, s9, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 20 -; VI-NEXT: s_lshr_b32 s46, s8, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 19 +; VI-NEXT: s_lshr_b32 s26, s9, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 20 +; VI-NEXT: s_lshr_b32 s26, s8, 16 ; VI-NEXT: s_add_i32 s11, s11, 3 -; VI-NEXT: v_writelane_b32 v21, s46, 21 -; VI-NEXT: s_lshr_b32 s46, s8, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 22 -; VI-NEXT: s_lshr_b32 s46, s11, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 23 -; VI-NEXT: s_lshr_b32 s46, s11, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 21 +; VI-NEXT: s_lshr_b32 s26, s8, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 22 +; VI-NEXT: s_lshr_b32 s26, s11, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 23 +; VI-NEXT: s_lshr_b32 s26, s11, 16 ; VI-NEXT: s_add_i32 s10, s10, 3 -; VI-NEXT: v_writelane_b32 v21, s46, 24 -; VI-NEXT: s_lshr_b32 s46, s11, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 25 -; VI-NEXT: s_lshr_b32 s46, s10, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 24 +; VI-NEXT: s_lshr_b32 s26, s11, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 25 +; VI-NEXT: s_lshr_b32 s26, s10, 16 ; VI-NEXT: s_add_i32 s13, s13, 3 -; VI-NEXT: v_writelane_b32 v21, s46, 26 -; VI-NEXT: s_lshr_b32 s46, s10, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 27 -; VI-NEXT: s_lshr_b32 s46, s13, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 28 -; VI-NEXT: s_lshr_b32 s46, s13, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 26 +; VI-NEXT: s_lshr_b32 s26, s10, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 27 +; VI-NEXT: s_lshr_b32 s26, s13, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 28 +; VI-NEXT: s_lshr_b32 s26, s13, 16 ; VI-NEXT: s_add_i32 s12, s12, 3 -; VI-NEXT: v_writelane_b32 v21, s46, 29 -; VI-NEXT: s_lshr_b32 s46, s13, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 30 -; VI-NEXT: s_lshr_b32 s46, s12, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 29 +; VI-NEXT: s_lshr_b32 s26, s13, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 30 +; VI-NEXT: s_lshr_b32 s26, s12, 16 ; VI-NEXT: s_add_i32 s15, s15, 3 -; VI-NEXT: v_writelane_b32 v21, s46, 31 -; VI-NEXT: s_lshr_b32 s46, s12, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 32 -; VI-NEXT: s_lshr_b32 s46, s15, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 33 -; VI-NEXT: s_lshr_b32 s46, s15, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 31 +; VI-NEXT: s_lshr_b32 s26, s12, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 32 +; VI-NEXT: s_lshr_b32 s26, s15, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 33 +; VI-NEXT: s_lshr_b32 s26, s15, 16 ; VI-NEXT: s_add_i32 s14, s14, 3 -; VI-NEXT: v_writelane_b32 v21, s46, 34 -; VI-NEXT: s_lshr_b32 s46, s15, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 35 -; VI-NEXT: s_lshr_b32 s46, s14, 16 -; VI-NEXT: s_add_i32 s41, s41, 3 -; VI-NEXT: v_writelane_b32 v21, s46, 36 -; VI-NEXT: s_lshr_b32 s46, s14, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 37 -; VI-NEXT: s_lshr_b32 s46, s41, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 38 -; VI-NEXT: s_lshr_b32 s46, s41, 16 -; VI-NEXT: s_add_i32 s40, s40, 3 -; VI-NEXT: v_writelane_b32 v21, s46, 39 -; VI-NEXT: s_lshr_b32 s46, s41, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 40 -; VI-NEXT: s_lshr_b32 s46, s40, 16 -; VI-NEXT: s_add_i32 s43, s43, 3 -; VI-NEXT: v_writelane_b32 v21, s46, 41 -; VI-NEXT: s_lshr_b32 s46, s40, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 42 -; VI-NEXT: s_lshr_b32 s46, s43, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 43 -; VI-NEXT: s_lshr_b32 s46, s43, 16 -; VI-NEXT: s_add_i32 s42, s42, 3 -; VI-NEXT: v_writelane_b32 v21, s46, 44 -; VI-NEXT: s_lshr_b32 s46, s43, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 45 -; VI-NEXT: s_lshr_b32 s46, s42, 16 -; VI-NEXT: s_add_i32 s45, s45, 3 -; VI-NEXT: v_writelane_b32 v21, s46, 46 -; VI-NEXT: s_lshr_b32 s46, s42, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 47 -; VI-NEXT: s_lshr_b32 s46, s45, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 48 -; VI-NEXT: s_lshr_b32 s46, s45, 16 -; VI-NEXT: s_add_i32 s44, s44, 3 -; VI-NEXT: v_writelane_b32 v21, s46, 49 -; VI-NEXT: s_lshr_b32 s46, s45, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 50 -; VI-NEXT: s_lshr_b32 s46, s44, 16 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: v_writelane_b32 v21, s46, 51 -; VI-NEXT: s_lshr_b32 s46, s44, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 52 -; VI-NEXT: s_lshr_b32 s46, s29, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 53 -; VI-NEXT: s_lshr_b32 s46, s29, 16 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: v_writelane_b32 v21, s46, 54 -; VI-NEXT: s_lshr_b32 s46, s29, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 55 -; VI-NEXT: s_lshr_b32 s46, s28, 16 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: v_writelane_b32 v21, s46, 56 -; VI-NEXT: s_lshr_b32 s46, s28, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 57 -; VI-NEXT: s_lshr_b32 s46, s27, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 58 -; VI-NEXT: s_lshr_b32 s46, s27, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 59 -; VI-NEXT: s_lshr_b64 s[60:61], s[4:5], 24 -; VI-NEXT: v_writelane_b32 v21, s60, 6 -; VI-NEXT: v_writelane_b32 v21, s61, 7 -; VI-NEXT: s_lshr_b64 s[60:61], s[6:7], 24 -; VI-NEXT: v_writelane_b32 v21, s60, 4 -; VI-NEXT: v_writelane_b32 v21, s61, 5 -; VI-NEXT: s_lshr_b64 s[60:61], s[8:9], 24 -; VI-NEXT: v_writelane_b32 v21, s60, 2 +; VI-NEXT: v_writelane_b32 v22, s26, 34 +; VI-NEXT: s_lshr_b32 s26, s15, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 35 +; VI-NEXT: s_lshr_b32 s26, s14, 16 ; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: v_writelane_b32 v22, s26, 36 +; VI-NEXT: s_lshr_b32 s26, s14, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 37 +; VI-NEXT: s_lshr_b32 s26, s17, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 38 +; VI-NEXT: s_lshr_b32 s26, s17, 16 ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_writelane_b32 v22, s26, 39 +; VI-NEXT: s_lshr_b32 s26, s17, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 40 +; VI-NEXT: s_lshr_b32 s26, s16, 16 ; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: v_writelane_b32 v22, s26, 41 +; VI-NEXT: s_lshr_b32 s26, s16, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 42 +; VI-NEXT: s_lshr_b32 s26, s19, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 43 +; VI-NEXT: s_lshr_b32 s26, s19, 16 ; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: v_writelane_b32 v22, s26, 44 +; VI-NEXT: s_lshr_b32 s26, s19, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 45 +; VI-NEXT: s_lshr_b32 s26, s18, 16 ; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: v_writelane_b32 v22, s26, 46 +; VI-NEXT: s_lshr_b32 s26, s18, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 47 +; VI-NEXT: s_lshr_b32 s26, s21, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 48 +; VI-NEXT: s_lshr_b32 s26, s21, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: v_writelane_b32 v22, s26, 49 +; VI-NEXT: s_lshr_b32 s26, s21, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 50 +; VI-NEXT: s_lshr_b32 s26, s20, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_writelane_b32 v22, s26, 51 +; VI-NEXT: s_lshr_b32 s26, s20, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 52 +; VI-NEXT: s_lshr_b32 s26, s23, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 53 +; VI-NEXT: s_lshr_b32 s26, s23, 16 ; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: v_writelane_b32 v22, s26, 54 +; VI-NEXT: s_lshr_b32 s26, s23, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 55 +; VI-NEXT: s_lshr_b32 s26, s22, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: v_writelane_b32 v22, s26, 56 +; VI-NEXT: s_lshr_b32 s26, s22, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 57 +; VI-NEXT: s_lshr_b32 s26, s25, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 58 +; VI-NEXT: s_lshr_b32 s26, s25, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 59 +; VI-NEXT: s_lshr_b64 s[60:61], s[4:5], 24 +; VI-NEXT: v_writelane_b32 v22, s60, 6 +; VI-NEXT: v_writelane_b32 v22, s61, 7 +; VI-NEXT: s_lshr_b64 s[60:61], s[6:7], 24 +; VI-NEXT: v_writelane_b32 v22, s60, 4 +; VI-NEXT: v_writelane_b32 v22, s61, 5 +; VI-NEXT: s_lshr_b64 s[60:61], s[8:9], 24 +; VI-NEXT: v_writelane_b32 v22, s60, 2 +; VI-NEXT: s_add_i32 s57, s57, 3 +; VI-NEXT: s_add_i32 s56, s56, 3 +; VI-NEXT: s_add_i32 s47, s47, 3 +; VI-NEXT: s_add_i32 s46, s46, 3 +; VI-NEXT: s_add_i32 s45, s45, 3 +; VI-NEXT: s_add_i32 s44, s44, 3 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_add_i32 s40, s40, 3 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: v_writelane_b32 v21, s61, 3 +; VI-NEXT: v_writelane_b32 v22, s61, 3 ; VI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 -; VI-NEXT: s_lshr_b32 s66, s27, 8 -; VI-NEXT: s_lshr_b32 s67, s26, 16 -; VI-NEXT: s_lshr_b32 s68, s26, 8 -; VI-NEXT: s_lshr_b32 s69, s25, 24 -; VI-NEXT: s_lshr_b32 s70, s25, 16 -; VI-NEXT: s_lshr_b32 s71, s25, 8 -; VI-NEXT: s_lshr_b32 s80, s24, 16 -; VI-NEXT: s_lshr_b32 s81, s24, 8 -; VI-NEXT: s_lshr_b32 s82, s23, 24 -; VI-NEXT: s_lshr_b32 s83, s23, 16 -; VI-NEXT: s_lshr_b32 s84, s23, 8 -; VI-NEXT: s_lshr_b32 s85, s22, 16 -; VI-NEXT: s_lshr_b32 s86, s22, 8 -; VI-NEXT: s_lshr_b32 s87, s21, 24 -; VI-NEXT: s_lshr_b32 s50, s21, 16 -; VI-NEXT: s_lshr_b32 s46, s21, 8 -; VI-NEXT: s_lshr_b32 s47, s20, 16 -; VI-NEXT: s_lshr_b32 s56, s20, 8 -; VI-NEXT: s_lshr_b32 s57, s19, 24 -; VI-NEXT: s_lshr_b32 s51, s19, 16 -; VI-NEXT: s_lshr_b32 s52, s19, 8 -; VI-NEXT: s_lshr_b32 s53, s18, 16 -; VI-NEXT: s_lshr_b32 s54, s18, 8 -; VI-NEXT: s_lshr_b32 s58, s17, 24 -; VI-NEXT: s_lshr_b32 s59, s17, 16 -; VI-NEXT: s_lshr_b32 s55, s17, 8 -; VI-NEXT: s_lshr_b32 s64, s16, 16 -; VI-NEXT: s_lshr_b32 s65, s16, 8 -; VI-NEXT: v_writelane_b32 v21, s60, 0 +; VI-NEXT: s_lshr_b32 s66, s25, 8 +; VI-NEXT: s_lshr_b32 s67, s24, 16 +; VI-NEXT: s_lshr_b32 s68, s24, 8 +; VI-NEXT: s_lshr_b32 s69, s41, 24 +; VI-NEXT: s_lshr_b32 s70, s41, 16 +; VI-NEXT: s_lshr_b32 s71, s41, 8 +; VI-NEXT: s_lshr_b32 s80, s40, 16 +; VI-NEXT: s_lshr_b32 s81, s40, 8 +; VI-NEXT: s_lshr_b32 s82, s43, 24 +; VI-NEXT: s_lshr_b32 s83, s43, 16 +; VI-NEXT: s_lshr_b32 s84, s43, 8 +; VI-NEXT: s_lshr_b32 s85, s42, 16 +; VI-NEXT: s_lshr_b32 s86, s42, 8 +; VI-NEXT: s_lshr_b32 s87, s45, 24 +; VI-NEXT: s_lshr_b32 s50, s45, 16 +; VI-NEXT: s_lshr_b32 s26, s45, 8 +; VI-NEXT: s_lshr_b32 s27, s44, 16 +; VI-NEXT: s_lshr_b32 s28, s44, 8 +; VI-NEXT: s_lshr_b32 s29, s47, 24 +; VI-NEXT: s_lshr_b32 s51, s47, 16 +; VI-NEXT: s_lshr_b32 s52, s47, 8 +; VI-NEXT: s_lshr_b32 s53, s46, 16 +; VI-NEXT: s_lshr_b32 s54, s46, 8 +; VI-NEXT: s_lshr_b32 s58, s57, 24 +; VI-NEXT: s_lshr_b32 s59, s57, 16 +; VI-NEXT: s_lshr_b32 s55, s57, 8 +; VI-NEXT: s_lshr_b32 s64, s56, 16 +; VI-NEXT: s_lshr_b32 s65, s56, 8 +; VI-NEXT: v_writelane_b32 v22, s60, 0 ; VI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 ; VI-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 -; VI-NEXT: s_lshr_b64 s[74:75], s[40:41], 24 -; VI-NEXT: s_lshr_b64 s[76:77], s[42:43], 24 -; VI-NEXT: s_lshr_b64 s[78:79], s[44:45], 24 -; VI-NEXT: s_lshr_b64 s[88:89], s[28:29], 24 -; VI-NEXT: s_lshr_b64 s[90:91], s[26:27], 24 -; VI-NEXT: s_lshr_b64 s[30:31], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[34:35], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[36:37], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[38:39], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[48:49], s[16:17], 24 -; VI-NEXT: v_writelane_b32 v21, s61, 1 +; VI-NEXT: s_lshr_b64 s[74:75], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[76:77], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[88:89], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[90:91], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[30:31], s[40:41], 24 +; VI-NEXT: s_lshr_b64 s[34:35], s[42:43], 24 +; VI-NEXT: s_lshr_b64 s[36:37], s[44:45], 24 +; VI-NEXT: s_lshr_b64 s[38:39], s[46:47], 24 +; VI-NEXT: s_lshr_b64 s[48:49], s[56:57], 24 +; VI-NEXT: v_writelane_b32 v22, s61, 1 ; VI-NEXT: .LBB13_3: ; %end ; VI-NEXT: s_lshl_b32 s61, s65, 8 -; VI-NEXT: s_and_b32 s16, s16, 0xff -; VI-NEXT: s_or_b32 s16, s16, s61 +; VI-NEXT: s_and_b32 s56, s56, 0xff +; VI-NEXT: s_or_b32 s56, s56, s61 ; VI-NEXT: s_lshl_b32 s61, s48, 8 ; VI-NEXT: s_and_b32 s63, s64, 0xff ; VI-NEXT: s_or_b32 s61, s63, s61 -; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_and_b32 s56, s56, 0xffff ; VI-NEXT: s_lshl_b32 s61, s61, 16 -; VI-NEXT: s_or_b32 s16, s16, s61 -; VI-NEXT: v_mov_b32_e32 v1, s16 -; VI-NEXT: s_and_b32 s16, s17, 0xff -; VI-NEXT: s_lshl_b32 s17, s55, 8 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_and_b32 s17, s59, 0xff +; VI-NEXT: s_or_b32 s56, s56, s61 +; VI-NEXT: v_mov_b32_e32 v1, s56 +; VI-NEXT: s_and_b32 s56, s57, 0xff +; VI-NEXT: s_lshl_b32 s57, s55, 8 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, s59, 0xff ; VI-NEXT: s_lshl_b32 s58, s58, 8 -; VI-NEXT: s_or_b32 s17, s17, s58 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: s_lshl_b32 s16, s54, 8 -; VI-NEXT: s_and_b32 s17, s18, 0xff -; VI-NEXT: s_or_b32 s16, s17, s16 -; VI-NEXT: s_lshl_b32 s17, s38, 8 -; VI-NEXT: s_and_b32 s18, s53, 0xff -; VI-NEXT: s_or_b32 s17, s18, s17 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_mov_b32_e32 v3, s16 -; VI-NEXT: s_and_b32 s16, s19, 0xff -; VI-NEXT: s_lshl_b32 s17, s52, 8 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_and_b32 s17, s51, 0xff -; VI-NEXT: s_lshl_b32 s18, s57, 8 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_mov_b32_e32 v4, s16 -; VI-NEXT: s_lshl_b32 s16, s56, 8 -; VI-NEXT: s_and_b32 s17, s20, 0xff -; VI-NEXT: s_or_b32 s16, s17, s16 -; VI-NEXT: s_lshl_b32 s17, s36, 8 -; VI-NEXT: s_and_b32 s18, s47, 0xff -; VI-NEXT: s_or_b32 s17, s18, s17 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_mov_b32_e32 v5, s16 -; VI-NEXT: s_and_b32 s16, s21, 0xff -; VI-NEXT: s_lshl_b32 s17, s46, 8 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_and_b32 s17, s50, 0xff -; VI-NEXT: s_lshl_b32 s18, s87, 8 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_mov_b32_e32 v6, s16 -; VI-NEXT: s_lshl_b32 s16, s86, 8 -; VI-NEXT: s_and_b32 s17, s22, 0xff -; VI-NEXT: s_or_b32 s16, s17, s16 -; VI-NEXT: s_lshl_b32 s17, s34, 8 -; VI-NEXT: s_and_b32 s18, s85, 0xff -; VI-NEXT: s_or_b32 s17, s18, s17 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_mov_b32_e32 v7, s16 -; VI-NEXT: s_and_b32 s16, s23, 0xff -; VI-NEXT: s_lshl_b32 s17, s84, 8 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_and_b32 s17, s83, 0xff -; VI-NEXT: s_lshl_b32 s18, s82, 8 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_mov_b32_e32 v8, s16 -; VI-NEXT: s_lshl_b32 s16, s81, 8 -; VI-NEXT: s_and_b32 s17, s24, 0xff -; VI-NEXT: s_or_b32 s16, s17, s16 -; VI-NEXT: s_lshl_b32 s17, s30, 8 -; VI-NEXT: s_and_b32 s18, s80, 0xff -; VI-NEXT: s_or_b32 s17, s18, s17 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_mov_b32_e32 v9, s16 -; VI-NEXT: s_and_b32 s16, s25, 0xff -; VI-NEXT: s_lshl_b32 s17, s71, 8 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_and_b32 s17, s70, 0xff -; VI-NEXT: s_lshl_b32 s18, s69, 8 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_mov_b32_e32 v10, s16 -; VI-NEXT: s_lshl_b32 s16, s68, 8 -; VI-NEXT: s_and_b32 s17, s26, 0xff -; VI-NEXT: s_or_b32 s16, s17, s16 -; VI-NEXT: s_lshl_b32 s17, s90, 8 -; VI-NEXT: s_and_b32 s18, s67, 0xff -; VI-NEXT: s_or_b32 s17, s18, s17 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_mov_b32_e32 v11, s16 -; VI-NEXT: s_and_b32 s16, s27, 0xff -; VI-NEXT: s_lshl_b32 s17, s66, 8 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_readlane_b32 s17, v21, 59 -; VI-NEXT: v_readlane_b32 s18, v21, 58 -; VI-NEXT: s_and_b32 s17, s17, 0xff -; VI-NEXT: s_lshl_b32 s18, s18, 8 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_mov_b32_e32 v12, s16 -; VI-NEXT: v_readlane_b32 s16, v21, 57 -; VI-NEXT: s_lshl_b32 s16, s16, 8 -; VI-NEXT: s_and_b32 s17, s28, 0xff -; VI-NEXT: v_readlane_b32 s18, v21, 56 -; VI-NEXT: s_or_b32 s16, s17, s16 -; VI-NEXT: s_lshl_b32 s17, s88, 8 -; VI-NEXT: s_and_b32 s18, s18, 0xff -; VI-NEXT: s_or_b32 s17, s18, s17 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_readlane_b32 s17, v21, 55 -; VI-NEXT: v_mov_b32_e32 v13, s16 -; VI-NEXT: s_and_b32 s16, s29, 0xff -; VI-NEXT: s_lshl_b32 s17, s17, 8 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_readlane_b32 s17, v21, 54 -; VI-NEXT: v_readlane_b32 s18, v21, 53 -; VI-NEXT: s_and_b32 s17, s17, 0xff -; VI-NEXT: s_lshl_b32 s18, s18, 8 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_mov_b32_e32 v14, s16 -; VI-NEXT: v_readlane_b32 s16, v21, 52 -; VI-NEXT: s_lshl_b32 s16, s16, 8 -; VI-NEXT: s_and_b32 s17, s44, 0xff -; VI-NEXT: v_readlane_b32 s18, v21, 51 -; VI-NEXT: s_or_b32 s16, s17, s16 -; VI-NEXT: s_lshl_b32 s17, s78, 8 -; VI-NEXT: s_and_b32 s18, s18, 0xff -; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s56, s56, 0xffff +; VI-NEXT: s_lshl_b32 s57, s57, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: v_mov_b32_e32 v2, s56 +; VI-NEXT: s_lshl_b32 s56, s54, 8 +; VI-NEXT: s_and_b32 s46, s46, 0xff +; VI-NEXT: s_or_b32 s46, s46, s56 +; VI-NEXT: s_lshl_b32 s56, s38, 8 +; VI-NEXT: s_and_b32 s57, s53, 0xff +; VI-NEXT: s_or_b32 s56, s57, s56 +; VI-NEXT: s_and_b32 s46, s46, 0xffff +; VI-NEXT: s_lshl_b32 s56, s56, 16 +; VI-NEXT: s_or_b32 s46, s46, s56 +; VI-NEXT: v_mov_b32_e32 v3, s46 +; VI-NEXT: s_and_b32 s46, s47, 0xff +; VI-NEXT: s_lshl_b32 s47, s52, 8 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, s51, 0xff +; VI-NEXT: s_lshl_b32 s29, s29, 8 +; VI-NEXT: s_or_b32 s29, s47, s29 +; VI-NEXT: s_and_b32 s46, s46, 0xffff +; VI-NEXT: s_lshl_b32 s29, s29, 16 +; VI-NEXT: s_or_b32 s29, s46, s29 +; VI-NEXT: v_mov_b32_e32 v4, s29 +; VI-NEXT: s_lshl_b32 s28, s28, 8 +; VI-NEXT: s_and_b32 s29, s44, 0xff +; VI-NEXT: s_or_b32 s28, s29, s28 +; VI-NEXT: s_lshl_b32 s29, s36, 8 +; VI-NEXT: s_and_b32 s27, s27, 0xff +; VI-NEXT: s_or_b32 s27, s27, s29 +; VI-NEXT: s_and_b32 s28, s28, 0xffff +; VI-NEXT: s_lshl_b32 s27, s27, 16 +; VI-NEXT: s_or_b32 s27, s28, s27 +; VI-NEXT: v_mov_b32_e32 v5, s27 +; VI-NEXT: s_and_b32 s27, s45, 0xff +; VI-NEXT: s_lshl_b32 s26, s26, 8 +; VI-NEXT: s_or_b32 s26, s27, s26 +; VI-NEXT: s_and_b32 s27, s50, 0xff +; VI-NEXT: s_lshl_b32 s28, s87, 8 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s26, s26, 0xffff +; VI-NEXT: s_lshl_b32 s27, s27, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: v_mov_b32_e32 v6, s26 +; VI-NEXT: s_lshl_b32 s26, s86, 8 +; VI-NEXT: s_and_b32 s27, s42, 0xff +; VI-NEXT: s_or_b32 s26, s27, s26 +; VI-NEXT: s_lshl_b32 s27, s34, 8 +; VI-NEXT: s_and_b32 s28, s85, 0xff +; VI-NEXT: s_or_b32 s27, s28, s27 +; VI-NEXT: s_and_b32 s26, s26, 0xffff +; VI-NEXT: s_lshl_b32 s27, s27, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: v_mov_b32_e32 v7, s26 +; VI-NEXT: s_and_b32 s26, s43, 0xff +; VI-NEXT: s_lshl_b32 s27, s84, 8 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, s83, 0xff +; VI-NEXT: s_lshl_b32 s28, s82, 8 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s26, s26, 0xffff +; VI-NEXT: s_lshl_b32 s27, s27, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: v_mov_b32_e32 v8, s26 +; VI-NEXT: s_lshl_b32 s26, s81, 8 +; VI-NEXT: s_and_b32 s27, s40, 0xff +; VI-NEXT: s_or_b32 s26, s27, s26 +; VI-NEXT: s_lshl_b32 s27, s30, 8 +; VI-NEXT: s_and_b32 s28, s80, 0xff +; VI-NEXT: s_or_b32 s27, s28, s27 +; VI-NEXT: s_and_b32 s26, s26, 0xffff +; VI-NEXT: s_lshl_b32 s27, s27, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: v_mov_b32_e32 v9, s26 +; VI-NEXT: s_and_b32 s26, s41, 0xff +; VI-NEXT: s_lshl_b32 s27, s71, 8 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, s70, 0xff +; VI-NEXT: s_lshl_b32 s28, s69, 8 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s26, s26, 0xffff +; VI-NEXT: s_lshl_b32 s27, s27, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: s_lshl_b32 s26, s68, 8 +; VI-NEXT: s_and_b32 s24, s24, 0xff +; VI-NEXT: s_or_b32 s24, s24, s26 +; VI-NEXT: s_lshl_b32 s26, s90, 8 +; VI-NEXT: s_and_b32 s27, s67, 0xff +; VI-NEXT: s_or_b32 s26, s27, s26 +; VI-NEXT: s_and_b32 s24, s24, 0xffff +; VI-NEXT: s_lshl_b32 s26, s26, 16 +; VI-NEXT: s_or_b32 s24, s24, s26 +; VI-NEXT: v_mov_b32_e32 v11, s24 +; VI-NEXT: s_and_b32 s24, s25, 0xff +; VI-NEXT: s_lshl_b32 s25, s66, 8 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: v_readlane_b32 s25, v22, 59 +; VI-NEXT: v_readlane_b32 s26, v22, 58 +; VI-NEXT: s_and_b32 s25, s25, 0xff +; VI-NEXT: s_lshl_b32 s26, s26, 8 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s24, s24, 0xffff +; VI-NEXT: s_lshl_b32 s25, s25, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: v_mov_b32_e32 v12, s24 +; VI-NEXT: v_readlane_b32 s24, v22, 57 +; VI-NEXT: s_lshl_b32 s24, s24, 8 +; VI-NEXT: s_and_b32 s22, s22, 0xff +; VI-NEXT: v_readlane_b32 s25, v22, 56 +; VI-NEXT: s_or_b32 s22, s22, s24 +; VI-NEXT: s_lshl_b32 s24, s88, 8 +; VI-NEXT: s_and_b32 s25, s25, 0xff +; VI-NEXT: s_or_b32 s24, s25, s24 +; VI-NEXT: s_and_b32 s22, s22, 0xffff +; VI-NEXT: s_lshl_b32 s24, s24, 16 +; VI-NEXT: s_or_b32 s22, s22, s24 +; VI-NEXT: v_mov_b32_e32 v13, s22 +; VI-NEXT: s_and_b32 s22, s23, 0xff +; VI-NEXT: v_readlane_b32 s23, v22, 55 +; VI-NEXT: s_lshl_b32 s23, s23, 8 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: v_readlane_b32 s23, v22, 54 +; VI-NEXT: v_readlane_b32 s24, v22, 53 +; VI-NEXT: s_and_b32 s23, s23, 0xff +; VI-NEXT: s_lshl_b32 s24, s24, 8 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s22, s22, 0xffff +; VI-NEXT: s_lshl_b32 s23, s23, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: v_mov_b32_e32 v14, s22 +; VI-NEXT: v_readlane_b32 s22, v22, 52 +; VI-NEXT: s_lshl_b32 s22, s22, 8 +; VI-NEXT: s_and_b32 s20, s20, 0xff +; VI-NEXT: v_readlane_b32 s23, v22, 51 +; VI-NEXT: s_or_b32 s20, s20, s22 +; VI-NEXT: s_lshl_b32 s22, s78, 8 +; VI-NEXT: s_and_b32 s23, s23, 0xff +; VI-NEXT: s_or_b32 s22, s23, s22 +; VI-NEXT: s_and_b32 s20, s20, 0xffff +; VI-NEXT: s_lshl_b32 s22, s22, 16 +; VI-NEXT: s_or_b32 s20, s20, s22 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v0 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: v_mov_b32_e32 v15, s20 +; VI-NEXT: s_and_b32 s20, s21, 0xff +; VI-NEXT: v_readlane_b32 s21, v22, 50 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_readlane_b32 s17, v21, 50 +; VI-NEXT: s_lshl_b32 s21, s21, 8 ; VI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0 -; VI-NEXT: v_mov_b32_e32 v15, s16 -; VI-NEXT: s_and_b32 s16, s45, 0xff -; VI-NEXT: s_lshl_b32 s17, s17, 8 +; VI-NEXT: s_or_b32 s20, s20, s21 ; VI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0 -; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s21, v22, 49 +; VI-NEXT: v_readlane_b32 s22, v22, 48 ; VI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0 -; VI-NEXT: v_readlane_b32 s17, v21, 49 -; VI-NEXT: v_readlane_b32 s18, v21, 48 +; VI-NEXT: s_and_b32 s21, s21, 0xff +; VI-NEXT: s_lshl_b32 s22, s22, 8 ; VI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0 -; VI-NEXT: s_and_b32 s17, s17, 0xff -; VI-NEXT: s_lshl_b32 s18, s18, 8 +; VI-NEXT: s_or_b32 s21, s21, s22 ; VI-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0 -; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s20, s20, 0xffff +; VI-NEXT: s_lshl_b32 s21, s21, 16 ; VI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 ; VI-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 36, v0 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_readlane_b32 s17, v21, 47 +; VI-NEXT: v_mov_b32_e32 v2, s20 +; VI-NEXT: v_readlane_b32 s20, v22, 47 ; VI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 40, v0 -; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: s_and_b32 s16, s42, 0xff -; VI-NEXT: s_lshl_b32 s17, s17, 8 +; VI-NEXT: s_and_b32 s18, s18, 0xff +; VI-NEXT: s_lshl_b32 s20, s20, 8 ; VI-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 44, v0 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_readlane_b32 s17, v21, 46 +; VI-NEXT: s_or_b32 s18, s18, s20 +; VI-NEXT: v_readlane_b32 s20, v22, 46 ; VI-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v0 -; VI-NEXT: s_and_b32 s17, s17, 0xff -; VI-NEXT: s_lshl_b32 s18, s76, 8 +; VI-NEXT: s_and_b32 s20, s20, 0xff +; VI-NEXT: s_lshl_b32 s21, s76, 8 ; VI-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 52, v0 -; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_or_b32 s20, s20, s21 ; VI-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 56, v0 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_and_b32 s18, s18, 0xffff +; VI-NEXT: s_lshl_b32 s20, s20, 16 ; VI-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 60, v0 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_readlane_b32 s17, v21, 45 +; VI-NEXT: s_or_b32 s18, s18, s20 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: s_and_b32 s16, s43, 0xff -; VI-NEXT: s_lshl_b32 s17, s17, 8 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_readlane_b32 s17, v21, 44 -; VI-NEXT: v_readlane_b32 s18, v21, 43 -; VI-NEXT: s_and_b32 s17, s17, 0xff -; VI-NEXT: s_lshl_b32 s18, s18, 8 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_and_b32 s18, s19, 0xff +; VI-NEXT: v_readlane_b32 s19, v22, 45 +; VI-NEXT: s_lshl_b32 s19, s19, 8 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: v_readlane_b32 s19, v22, 44 +; VI-NEXT: v_readlane_b32 s20, v22, 43 +; VI-NEXT: s_and_b32 s19, s19, 0xff +; VI-NEXT: s_lshl_b32 s20, s20, 8 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s18, s18, 0xffff +; VI-NEXT: s_lshl_b32 s19, s19, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 64, v0 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_readlane_b32 s17, v21, 42 +; VI-NEXT: s_or_b32 s18, s18, s19 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: s_and_b32 s16, s40, 0xff -; VI-NEXT: s_lshl_b32 s17, s17, 8 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_readlane_b32 s17, v21, 41 -; VI-NEXT: s_and_b32 s17, s17, 0xff -; VI-NEXT: s_lshl_b32 s18, s74, 8 -; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_readlane_b32 s18, v22, 42 +; VI-NEXT: s_and_b32 s16, s16, 0xff +; VI-NEXT: s_lshl_b32 s18, s18, 8 +; VI-NEXT: s_or_b32 s16, s16, s18 +; VI-NEXT: v_readlane_b32 s18, v22, 41 +; VI-NEXT: s_and_b32 s18, s18, 0xff +; VI-NEXT: s_lshl_b32 s19, s74, 8 +; VI-NEXT: s_or_b32 s18, s18, s19 ; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_lshl_b32 s18, s18, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x44, v0 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_readlane_b32 s17, v21, 40 +; VI-NEXT: s_or_b32 s16, s16, s18 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: s_and_b32 s16, s41, 0xff +; VI-NEXT: s_and_b32 s16, s17, 0xff +; VI-NEXT: v_readlane_b32 s17, v22, 40 ; VI-NEXT: s_lshl_b32 s17, s17, 8 ; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_readlane_b32 s17, v21, 39 -; VI-NEXT: v_readlane_b32 s18, v21, 38 +; VI-NEXT: v_readlane_b32 s17, v22, 39 +; VI-NEXT: v_readlane_b32 s18, v22, 38 ; VI-NEXT: s_and_b32 s17, s17, 0xff ; VI-NEXT: s_lshl_b32 s18, s18, 8 ; VI-NEXT: s_or_b32 s17, s17, s18 @@ -9289,11 +9345,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; VI-NEXT: s_or_b32 s16, s16, s17 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: v_readlane_b32 s16, v21, 37 +; VI-NEXT: v_readlane_b32 s16, v22, 37 ; VI-NEXT: s_and_b32 s14, s14, 0xff ; VI-NEXT: s_lshl_b32 s16, s16, 8 ; VI-NEXT: s_or_b32 s14, s14, s16 -; VI-NEXT: v_readlane_b32 s16, v21, 36 +; VI-NEXT: v_readlane_b32 s16, v22, 36 ; VI-NEXT: s_and_b32 s16, s16, 0xff ; VI-NEXT: s_lshl_b32 s17, s72, 8 ; VI-NEXT: s_or_b32 s16, s16, s17 @@ -9304,11 +9360,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s14 ; VI-NEXT: s_and_b32 s14, s15, 0xff -; VI-NEXT: v_readlane_b32 s15, v21, 35 +; VI-NEXT: v_readlane_b32 s15, v22, 35 ; VI-NEXT: s_lshl_b32 s15, s15, 8 ; VI-NEXT: s_or_b32 s14, s14, s15 -; VI-NEXT: v_readlane_b32 s15, v21, 34 -; VI-NEXT: v_readlane_b32 s16, v21, 33 +; VI-NEXT: v_readlane_b32 s15, v22, 34 +; VI-NEXT: v_readlane_b32 s16, v22, 33 ; VI-NEXT: s_and_b32 s15, s15, 0xff ; VI-NEXT: s_lshl_b32 s16, s16, 8 ; VI-NEXT: s_or_b32 s15, s15, s16 @@ -9318,11 +9374,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; VI-NEXT: s_or_b32 s14, s14, s15 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s14 -; VI-NEXT: v_readlane_b32 s14, v21, 32 +; VI-NEXT: v_readlane_b32 s14, v22, 32 ; VI-NEXT: s_and_b32 s12, s12, 0xff ; VI-NEXT: s_lshl_b32 s14, s14, 8 ; VI-NEXT: s_or_b32 s12, s12, s14 -; VI-NEXT: v_readlane_b32 s14, v21, 31 +; VI-NEXT: v_readlane_b32 s14, v22, 31 ; VI-NEXT: s_and_b32 s14, s14, 0xff ; VI-NEXT: s_lshl_b32 s15, s62, 8 ; VI-NEXT: s_or_b32 s14, s14, s15 @@ -9333,11 +9389,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s12 ; VI-NEXT: s_and_b32 s12, s13, 0xff -; VI-NEXT: v_readlane_b32 s13, v21, 30 +; VI-NEXT: v_readlane_b32 s13, v22, 30 ; VI-NEXT: s_lshl_b32 s13, s13, 8 ; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: v_readlane_b32 s13, v21, 29 -; VI-NEXT: v_readlane_b32 s14, v21, 28 +; VI-NEXT: v_readlane_b32 s13, v22, 29 +; VI-NEXT: v_readlane_b32 s14, v22, 28 ; VI-NEXT: s_and_b32 s13, s13, 0xff ; VI-NEXT: s_lshl_b32 s14, s14, 8 ; VI-NEXT: s_or_b32 s13, s13, s14 @@ -9347,12 +9403,12 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; VI-NEXT: s_or_b32 s12, s12, s13 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s12 -; VI-NEXT: v_readlane_b32 s12, v21, 27 +; VI-NEXT: v_readlane_b32 s12, v22, 27 ; VI-NEXT: s_and_b32 s10, s10, 0xff ; VI-NEXT: s_lshl_b32 s12, s12, 8 ; VI-NEXT: s_or_b32 s10, s10, s12 -; VI-NEXT: v_readlane_b32 s12, v21, 26 -; VI-NEXT: v_readlane_b32 s14, v21, 0 +; VI-NEXT: v_readlane_b32 s12, v22, 26 +; VI-NEXT: v_readlane_b32 s14, v22, 0 ; VI-NEXT: s_and_b32 s12, s12, 0xff ; VI-NEXT: s_lshl_b32 s13, s14, 8 ; VI-NEXT: s_or_b32 s12, s12, s13 @@ -9363,11 +9419,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: s_and_b32 s10, s11, 0xff -; VI-NEXT: v_readlane_b32 s11, v21, 25 +; VI-NEXT: v_readlane_b32 s11, v22, 25 ; VI-NEXT: s_lshl_b32 s11, s11, 8 ; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: v_readlane_b32 s11, v21, 24 -; VI-NEXT: v_readlane_b32 s12, v21, 23 +; VI-NEXT: v_readlane_b32 s11, v22, 24 +; VI-NEXT: v_readlane_b32 s12, v22, 23 ; VI-NEXT: s_and_b32 s11, s11, 0xff ; VI-NEXT: s_lshl_b32 s12, s12, 8 ; VI-NEXT: s_or_b32 s11, s11, s12 @@ -9377,12 +9433,12 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; VI-NEXT: s_or_b32 s10, s10, s11 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_readlane_b32 s10, v21, 22 +; VI-NEXT: v_readlane_b32 s10, v22, 22 ; VI-NEXT: s_and_b32 s8, s8, 0xff ; VI-NEXT: s_lshl_b32 s10, s10, 8 ; VI-NEXT: s_or_b32 s8, s8, s10 -; VI-NEXT: v_readlane_b32 s10, v21, 21 -; VI-NEXT: v_readlane_b32 s12, v21, 2 +; VI-NEXT: v_readlane_b32 s10, v22, 21 +; VI-NEXT: v_readlane_b32 s12, v22, 2 ; VI-NEXT: s_and_b32 s10, s10, 0xff ; VI-NEXT: s_lshl_b32 s11, s12, 8 ; VI-NEXT: s_or_b32 s10, s10, s11 @@ -9393,11 +9449,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: s_and_b32 s8, s9, 0xff -; VI-NEXT: v_readlane_b32 s9, v21, 20 +; VI-NEXT: v_readlane_b32 s9, v22, 20 ; VI-NEXT: s_lshl_b32 s9, s9, 8 ; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: v_readlane_b32 s9, v21, 19 -; VI-NEXT: v_readlane_b32 s10, v21, 18 +; VI-NEXT: v_readlane_b32 s9, v22, 19 +; VI-NEXT: v_readlane_b32 s10, v22, 18 ; VI-NEXT: s_and_b32 s9, s9, 0xff ; VI-NEXT: s_lshl_b32 s10, s10, 8 ; VI-NEXT: s_or_b32 s9, s9, s10 @@ -9407,12 +9463,12 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; VI-NEXT: s_or_b32 s8, s8, s9 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_readlane_b32 s8, v21, 17 +; VI-NEXT: v_readlane_b32 s8, v22, 17 ; VI-NEXT: s_and_b32 s6, s6, 0xff ; VI-NEXT: s_lshl_b32 s8, s8, 8 ; VI-NEXT: s_or_b32 s6, s6, s8 -; VI-NEXT: v_readlane_b32 s8, v21, 16 -; VI-NEXT: v_readlane_b32 s10, v21, 4 +; VI-NEXT: v_readlane_b32 s8, v22, 16 +; VI-NEXT: v_readlane_b32 s10, v22, 4 ; VI-NEXT: s_and_b32 s8, s8, 0xff ; VI-NEXT: s_lshl_b32 s9, s10, 8 ; VI-NEXT: s_or_b32 s8, s8, s9 @@ -9423,11 +9479,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_and_b32 s6, s7, 0xff -; VI-NEXT: v_readlane_b32 s7, v21, 15 +; VI-NEXT: v_readlane_b32 s7, v22, 15 ; VI-NEXT: s_lshl_b32 s7, s7, 8 ; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: v_readlane_b32 s7, v21, 14 -; VI-NEXT: v_readlane_b32 s8, v21, 13 +; VI-NEXT: v_readlane_b32 s7, v22, 14 +; VI-NEXT: v_readlane_b32 s8, v22, 13 ; VI-NEXT: s_and_b32 s7, s7, 0xff ; VI-NEXT: s_lshl_b32 s8, s8, 8 ; VI-NEXT: s_or_b32 s7, s7, s8 @@ -9437,12 +9493,12 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_readlane_b32 s6, v21, 12 +; VI-NEXT: v_readlane_b32 s6, v22, 12 ; VI-NEXT: s_and_b32 s4, s4, 0xff ; VI-NEXT: s_lshl_b32 s6, s6, 8 ; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: v_readlane_b32 s6, v21, 11 -; VI-NEXT: v_readlane_b32 s8, v21, 6 +; VI-NEXT: v_readlane_b32 s6, v22, 11 +; VI-NEXT: v_readlane_b32 s8, v22, 6 ; VI-NEXT: s_and_b32 s6, s6, 0xff ; VI-NEXT: s_lshl_b32 s7, s8, 8 ; VI-NEXT: s_or_b32 s6, s6, s7 @@ -9453,11 +9509,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_and_b32 s4, s5, 0xff -; VI-NEXT: v_readlane_b32 s5, v21, 10 +; VI-NEXT: v_readlane_b32 s5, v22, 10 ; VI-NEXT: s_lshl_b32 s5, s5, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_readlane_b32 s5, v21, 9 -; VI-NEXT: v_readlane_b32 s6, v21, 8 +; VI-NEXT: v_readlane_b32 s5, v22, 9 +; VI-NEXT: v_readlane_b32 s6, v22, 8 ; VI-NEXT: s_and_b32 s5, s5, 0xff ; VI-NEXT: s_lshl_b32 s6, s6, 8 ; VI-NEXT: s_or_b32 s5, s5, s6 @@ -9468,46 +9524,46 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 ; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_readlane_b32 s15, v21, 1 -; VI-NEXT: v_readlane_b32 s13, v21, 3 -; VI-NEXT: v_readlane_b32 s11, v21, 5 -; VI-NEXT: v_readlane_b32 s9, v21, 7 +; VI-NEXT: v_readlane_b32 s15, v22, 1 +; VI-NEXT: v_readlane_b32 s13, v22, 3 +; VI-NEXT: v_readlane_b32 s11, v22, 5 +; VI-NEXT: v_readlane_b32 s9, v22, 7 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: v_readlane_b32 s87, v20, 31 -; VI-NEXT: v_readlane_b32 s86, v20, 30 -; VI-NEXT: v_readlane_b32 s85, v20, 29 -; VI-NEXT: v_readlane_b32 s84, v20, 28 -; VI-NEXT: v_readlane_b32 s83, v20, 27 -; VI-NEXT: v_readlane_b32 s82, v20, 26 -; VI-NEXT: v_readlane_b32 s81, v20, 25 -; VI-NEXT: v_readlane_b32 s80, v20, 24 -; VI-NEXT: v_readlane_b32 s71, v20, 23 -; VI-NEXT: v_readlane_b32 s70, v20, 22 -; VI-NEXT: v_readlane_b32 s69, v20, 21 -; VI-NEXT: v_readlane_b32 s68, v20, 20 -; VI-NEXT: v_readlane_b32 s67, v20, 19 -; VI-NEXT: v_readlane_b32 s66, v20, 18 -; VI-NEXT: v_readlane_b32 s65, v20, 17 -; VI-NEXT: v_readlane_b32 s64, v20, 16 -; VI-NEXT: v_readlane_b32 s55, v20, 15 -; VI-NEXT: v_readlane_b32 s54, v20, 14 -; VI-NEXT: v_readlane_b32 s53, v20, 13 -; VI-NEXT: v_readlane_b32 s52, v20, 12 -; VI-NEXT: v_readlane_b32 s51, v20, 11 -; VI-NEXT: v_readlane_b32 s50, v20, 10 -; VI-NEXT: v_readlane_b32 s49, v20, 9 -; VI-NEXT: v_readlane_b32 s48, v20, 8 -; VI-NEXT: v_readlane_b32 s39, v20, 7 -; VI-NEXT: v_readlane_b32 s38, v20, 6 -; VI-NEXT: v_readlane_b32 s37, v20, 5 -; VI-NEXT: v_readlane_b32 s36, v20, 4 -; VI-NEXT: v_readlane_b32 s35, v20, 3 -; VI-NEXT: v_readlane_b32 s34, v20, 2 -; VI-NEXT: v_readlane_b32 s31, v20, 1 -; VI-NEXT: v_readlane_b32 s30, v20, 0 +; VI-NEXT: v_readlane_b32 s87, v21, 31 +; VI-NEXT: v_readlane_b32 s86, v21, 30 +; VI-NEXT: v_readlane_b32 s85, v21, 29 +; VI-NEXT: v_readlane_b32 s84, v21, 28 +; VI-NEXT: v_readlane_b32 s83, v21, 27 +; VI-NEXT: v_readlane_b32 s82, v21, 26 +; VI-NEXT: v_readlane_b32 s81, v21, 25 +; VI-NEXT: v_readlane_b32 s80, v21, 24 +; VI-NEXT: v_readlane_b32 s71, v21, 23 +; VI-NEXT: v_readlane_b32 s70, v21, 22 +; VI-NEXT: v_readlane_b32 s69, v21, 21 +; VI-NEXT: v_readlane_b32 s68, v21, 20 +; VI-NEXT: v_readlane_b32 s67, v21, 19 +; VI-NEXT: v_readlane_b32 s66, v21, 18 +; VI-NEXT: v_readlane_b32 s65, v21, 17 +; VI-NEXT: v_readlane_b32 s64, v21, 16 +; VI-NEXT: v_readlane_b32 s55, v21, 15 +; VI-NEXT: v_readlane_b32 s54, v21, 14 +; VI-NEXT: v_readlane_b32 s53, v21, 13 +; VI-NEXT: v_readlane_b32 s52, v21, 12 +; VI-NEXT: v_readlane_b32 s51, v21, 11 +; VI-NEXT: v_readlane_b32 s50, v21, 10 +; VI-NEXT: v_readlane_b32 s49, v21, 9 +; VI-NEXT: v_readlane_b32 s48, v21, 8 +; VI-NEXT: v_readlane_b32 s39, v21, 7 +; VI-NEXT: v_readlane_b32 s38, v21, 6 +; VI-NEXT: v_readlane_b32 s37, v21, 5 +; VI-NEXT: v_readlane_b32 s36, v21, 4 +; VI-NEXT: v_readlane_b32 s35, v21, 3 +; VI-NEXT: v_readlane_b32 s34, v21, 2 +; VI-NEXT: v_readlane_b32 s31, v21, 1 +; VI-NEXT: v_readlane_b32 s30, v21, 0 ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -9525,10 +9581,10 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; VI-NEXT: ; implicit-def: $sgpr53 ; VI-NEXT: ; implicit-def: $sgpr52 ; VI-NEXT: ; implicit-def: $sgpr51 -; VI-NEXT: ; implicit-def: $sgpr57 -; VI-NEXT: ; implicit-def: $sgpr56 -; VI-NEXT: ; implicit-def: $sgpr47 -; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; implicit-def: $sgpr50 ; VI-NEXT: ; implicit-def: $sgpr87 ; VI-NEXT: ; implicit-def: $sgpr86 @@ -9657,68 +9713,96 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; VI-NEXT: ; implicit-def: $sgpr60 ; VI-NEXT: ; kill: killed $sgpr60 ; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: v_writelane_b32 v21, s60, 0 -; VI-NEXT: v_writelane_b32 v21, s61, 1 +; VI-NEXT: v_writelane_b32 v22, s60, 0 +; VI-NEXT: v_writelane_b32 v22, s61, 1 ; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: v_writelane_b32 v21, s60, 2 -; VI-NEXT: v_writelane_b32 v21, s61, 3 +; VI-NEXT: v_writelane_b32 v22, s60, 2 +; VI-NEXT: v_writelane_b32 v22, s61, 3 ; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: v_writelane_b32 v21, s60, 4 -; VI-NEXT: v_writelane_b32 v21, s61, 5 +; VI-NEXT: v_writelane_b32 v22, s60, 4 +; VI-NEXT: v_writelane_b32 v22, s61, 5 ; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: v_writelane_b32 v21, s60, 6 -; VI-NEXT: v_writelane_b32 v21, s61, 7 +; VI-NEXT: v_writelane_b32 v22, s60, 6 +; VI-NEXT: v_writelane_b32 v22, s61, 7 ; VI-NEXT: s_branch .LBB13_2 ; ; GFX9-LABEL: bitcast_v32i32_to_v128i8_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v20, s30, 0 -; GFX9-NEXT: v_writelane_b32 v20, s31, 1 -; GFX9-NEXT: v_writelane_b32 v20, s34, 2 -; GFX9-NEXT: v_writelane_b32 v20, s35, 3 -; GFX9-NEXT: v_writelane_b32 v20, s36, 4 -; GFX9-NEXT: v_writelane_b32 v20, s37, 5 -; GFX9-NEXT: v_writelane_b32 v20, s38, 6 -; GFX9-NEXT: v_writelane_b32 v20, s39, 7 -; GFX9-NEXT: v_writelane_b32 v20, s48, 8 -; GFX9-NEXT: v_writelane_b32 v20, s49, 9 -; GFX9-NEXT: v_writelane_b32 v20, s50, 10 -; GFX9-NEXT: v_writelane_b32 v20, s51, 11 -; GFX9-NEXT: v_writelane_b32 v20, s52, 12 -; GFX9-NEXT: v_writelane_b32 v20, s53, 13 -; GFX9-NEXT: v_writelane_b32 v20, s54, 14 -; GFX9-NEXT: v_writelane_b32 v20, s55, 15 -; GFX9-NEXT: v_writelane_b32 v20, s64, 16 -; GFX9-NEXT: v_writelane_b32 v20, s65, 17 -; GFX9-NEXT: v_writelane_b32 v20, s66, 18 -; GFX9-NEXT: v_writelane_b32 v20, s67, 19 -; GFX9-NEXT: v_writelane_b32 v20, s68, 20 -; GFX9-NEXT: v_writelane_b32 v20, s69, 21 -; GFX9-NEXT: v_writelane_b32 v20, s70, 22 -; GFX9-NEXT: v_writelane_b32 v20, s71, 23 -; GFX9-NEXT: v_writelane_b32 v20, s80, 24 -; GFX9-NEXT: v_writelane_b32 v20, s81, 25 -; GFX9-NEXT: v_writelane_b32 v20, s82, 26 -; GFX9-NEXT: v_writelane_b32 v20, s83, 27 -; GFX9-NEXT: v_writelane_b32 v20, s84, 28 -; GFX9-NEXT: v_writelane_b32 v20, s85, 29 -; GFX9-NEXT: v_writelane_b32 v20, s86, 30 -; GFX9-NEXT: v_writelane_b32 v20, s87, 31 -; GFX9-NEXT: v_writelane_b32 v20, s96, 32 -; GFX9-NEXT: v_writelane_b32 v20, s97, 33 +; GFX9-NEXT: v_writelane_b32 v21, s30, 0 +; GFX9-NEXT: v_writelane_b32 v21, s31, 1 +; GFX9-NEXT: v_writelane_b32 v21, s34, 2 +; GFX9-NEXT: v_writelane_b32 v21, s35, 3 +; GFX9-NEXT: v_writelane_b32 v21, s36, 4 +; GFX9-NEXT: v_writelane_b32 v21, s37, 5 +; GFX9-NEXT: v_writelane_b32 v21, s38, 6 +; GFX9-NEXT: v_writelane_b32 v21, s39, 7 +; GFX9-NEXT: v_writelane_b32 v21, s48, 8 +; GFX9-NEXT: v_writelane_b32 v21, s49, 9 +; GFX9-NEXT: v_writelane_b32 v21, s50, 10 +; GFX9-NEXT: v_writelane_b32 v21, s51, 11 +; GFX9-NEXT: v_writelane_b32 v21, s52, 12 +; GFX9-NEXT: v_writelane_b32 v21, s53, 13 +; GFX9-NEXT: v_writelane_b32 v21, s54, 14 +; GFX9-NEXT: v_writelane_b32 v21, s55, 15 +; GFX9-NEXT: v_writelane_b32 v21, s64, 16 +; GFX9-NEXT: v_writelane_b32 v21, s65, 17 +; GFX9-NEXT: v_writelane_b32 v21, s66, 18 +; GFX9-NEXT: v_writelane_b32 v21, s67, 19 +; GFX9-NEXT: v_writelane_b32 v21, s68, 20 +; GFX9-NEXT: v_mov_b32_e32 v20, s16 +; GFX9-NEXT: v_writelane_b32 v21, s69, 21 +; GFX9-NEXT: v_readfirstlane_b32 s56, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s17 +; GFX9-NEXT: v_writelane_b32 v21, s70, 22 +; GFX9-NEXT: v_readfirstlane_b32 s57, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s18 +; GFX9-NEXT: v_writelane_b32 v21, s71, 23 +; GFX9-NEXT: v_readfirstlane_b32 s46, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s19 +; GFX9-NEXT: v_writelane_b32 v21, s80, 24 +; GFX9-NEXT: v_readfirstlane_b32 s47, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s20 +; GFX9-NEXT: v_writelane_b32 v21, s81, 25 +; GFX9-NEXT: v_readfirstlane_b32 s44, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s21 +; GFX9-NEXT: v_writelane_b32 v21, s82, 26 +; GFX9-NEXT: v_readfirstlane_b32 s45, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s22 +; GFX9-NEXT: v_writelane_b32 v21, s83, 27 +; GFX9-NEXT: v_readfirstlane_b32 s42, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s23 +; GFX9-NEXT: v_writelane_b32 v21, s84, 28 +; GFX9-NEXT: v_readfirstlane_b32 s43, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s24 +; GFX9-NEXT: v_writelane_b32 v21, s85, 29 +; GFX9-NEXT: v_readfirstlane_b32 s40, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s25 +; GFX9-NEXT: v_writelane_b32 v21, s86, 30 +; GFX9-NEXT: v_readfirstlane_b32 s41, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s26 +; GFX9-NEXT: v_writelane_b32 v21, s87, 31 +; GFX9-NEXT: v_readfirstlane_b32 s24, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s27 +; GFX9-NEXT: v_writelane_b32 v21, s96, 32 +; GFX9-NEXT: v_readfirstlane_b32 s25, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s28 +; GFX9-NEXT: v_writelane_b32 v21, s97, 33 +; GFX9-NEXT: v_readfirstlane_b32 s22, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; GFX9-NEXT: v_writelane_b32 v20, s98, 34 -; GFX9-NEXT: v_readfirstlane_b32 s44, v1 -; GFX9-NEXT: v_readfirstlane_b32 s45, v2 -; GFX9-NEXT: v_readfirstlane_b32 s42, v3 -; GFX9-NEXT: v_readfirstlane_b32 s43, v4 -; GFX9-NEXT: v_readfirstlane_b32 s40, v5 -; GFX9-NEXT: v_readfirstlane_b32 s41, v6 +; GFX9-NEXT: v_writelane_b32 v21, s98, 34 +; GFX9-NEXT: v_readfirstlane_b32 s23, v20 +; GFX9-NEXT: v_readfirstlane_b32 s20, v1 +; GFX9-NEXT: v_readfirstlane_b32 s21, v2 +; GFX9-NEXT: v_readfirstlane_b32 s18, v3 +; GFX9-NEXT: v_readfirstlane_b32 s19, v4 +; GFX9-NEXT: v_readfirstlane_b32 s16, v5 +; GFX9-NEXT: v_readfirstlane_b32 s17, v6 ; GFX9-NEXT: v_readfirstlane_b32 s14, v7 ; GFX9-NEXT: v_readfirstlane_b32 s15, v8 ; GFX9-NEXT: v_readfirstlane_b32 s12, v9 @@ -9730,571 +9814,571 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: v_readfirstlane_b32 s6, v15 ; GFX9-NEXT: v_readfirstlane_b32 s7, v16 ; GFX9-NEXT: v_readfirstlane_b32 s4, v17 -; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_and_b64 s[26:27], vcc, exec ; GFX9-NEXT: v_readfirstlane_b32 s5, v18 -; GFX9-NEXT: v_writelane_b32 v20, s99, 35 -; GFX9-NEXT: ; implicit-def: $vgpr21 : SGPR spill to VGPR lane +; GFX9-NEXT: v_writelane_b32 v21, s99, 35 +; GFX9-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane ; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_lshr_b32 s46, s5, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 2 -; GFX9-NEXT: s_lshr_b32 s46, s5, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 3 -; GFX9-NEXT: s_lshr_b32 s46, s5, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 4 -; GFX9-NEXT: s_lshr_b32 s46, s4, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 5 -; GFX9-NEXT: s_lshr_b32 s46, s4, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 6 -; GFX9-NEXT: s_lshr_b32 s46, s7, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 7 -; GFX9-NEXT: s_lshr_b32 s46, s7, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 8 -; GFX9-NEXT: s_lshr_b32 s46, s7, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 9 -; GFX9-NEXT: s_lshr_b32 s46, s6, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 10 -; GFX9-NEXT: s_lshr_b32 s46, s6, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 11 -; GFX9-NEXT: s_lshr_b32 s46, s9, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 12 -; GFX9-NEXT: s_lshr_b32 s46, s9, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 13 -; GFX9-NEXT: s_lshr_b32 s46, s9, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 14 -; GFX9-NEXT: s_lshr_b32 s46, s8, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 15 -; GFX9-NEXT: s_lshr_b32 s46, s8, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 16 -; GFX9-NEXT: s_lshr_b32 s46, s11, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 17 -; GFX9-NEXT: s_lshr_b32 s46, s11, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 18 -; GFX9-NEXT: s_lshr_b32 s46, s11, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 19 -; GFX9-NEXT: s_lshr_b32 s46, s10, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 20 -; GFX9-NEXT: s_lshr_b32 s46, s10, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 21 -; GFX9-NEXT: s_lshr_b32 s46, s13, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 22 -; GFX9-NEXT: s_lshr_b32 s46, s13, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 23 -; GFX9-NEXT: s_lshr_b32 s46, s13, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 24 -; GFX9-NEXT: s_lshr_b32 s46, s12, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 25 -; GFX9-NEXT: s_lshr_b32 s46, s12, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 26 -; GFX9-NEXT: s_lshr_b32 s46, s15, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 27 -; GFX9-NEXT: s_lshr_b32 s46, s15, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 28 -; GFX9-NEXT: s_lshr_b32 s46, s15, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 29 -; GFX9-NEXT: s_lshr_b32 s46, s14, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 30 -; GFX9-NEXT: s_lshr_b32 s46, s14, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 31 -; GFX9-NEXT: s_lshr_b32 s46, s41, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 32 -; GFX9-NEXT: s_lshr_b32 s46, s41, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 33 -; GFX9-NEXT: s_lshr_b32 s46, s41, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 34 -; GFX9-NEXT: s_lshr_b32 s46, s40, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 35 -; GFX9-NEXT: s_lshr_b32 s46, s40, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 36 -; GFX9-NEXT: s_lshr_b32 s46, s43, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 37 -; GFX9-NEXT: s_lshr_b32 s46, s43, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 38 -; GFX9-NEXT: s_lshr_b32 s46, s43, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 39 -; GFX9-NEXT: s_lshr_b32 s46, s42, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 40 -; GFX9-NEXT: s_lshr_b32 s46, s42, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 41 -; GFX9-NEXT: s_lshr_b32 s46, s45, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 42 -; GFX9-NEXT: s_lshr_b32 s46, s45, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 43 -; GFX9-NEXT: s_lshr_b32 s46, s45, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 44 -; GFX9-NEXT: s_lshr_b32 s46, s44, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 45 -; GFX9-NEXT: s_lshr_b32 s46, s44, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 46 -; GFX9-NEXT: s_lshr_b32 s46, s29, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 47 -; GFX9-NEXT: s_lshr_b32 s46, s29, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 48 -; GFX9-NEXT: s_lshr_b32 s46, s29, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 49 -; GFX9-NEXT: s_lshr_b32 s46, s28, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 50 -; GFX9-NEXT: s_lshr_b64 s[56:57], s[4:5], 24 -; GFX9-NEXT: v_writelane_b32 v21, s56, 0 -; GFX9-NEXT: s_lshr_b32 s82, s28, 8 -; GFX9-NEXT: s_lshr_b32 s83, s27, 24 -; GFX9-NEXT: s_lshr_b32 s81, s27, 16 -; GFX9-NEXT: s_lshr_b32 s84, s27, 8 -; GFX9-NEXT: s_lshr_b32 s85, s26, 16 -; GFX9-NEXT: s_lshr_b32 s86, s26, 8 -; GFX9-NEXT: s_lshr_b32 s87, s25, 24 -; GFX9-NEXT: s_lshr_b32 s96, s25, 16 -; GFX9-NEXT: s_lshr_b32 s97, s25, 8 -; GFX9-NEXT: s_lshr_b32 s98, s24, 16 -; GFX9-NEXT: s_lshr_b32 s99, s24, 8 -; GFX9-NEXT: s_lshr_b32 s38, s23, 24 -; GFX9-NEXT: s_lshr_b32 s39, s23, 16 -; GFX9-NEXT: s_lshr_b32 s48, s23, 8 -; GFX9-NEXT: s_lshr_b32 s49, s22, 16 -; GFX9-NEXT: s_lshr_b32 s50, s22, 8 -; GFX9-NEXT: s_lshr_b32 s51, s21, 24 -; GFX9-NEXT: s_lshr_b32 s52, s21, 16 -; GFX9-NEXT: s_lshr_b32 s53, s21, 8 -; GFX9-NEXT: s_lshr_b32 s54, s20, 16 -; GFX9-NEXT: s_lshr_b32 s55, s20, 8 -; GFX9-NEXT: s_lshr_b32 s64, s19, 24 -; GFX9-NEXT: s_lshr_b32 s65, s19, 16 -; GFX9-NEXT: s_lshr_b32 s66, s19, 8 -; GFX9-NEXT: s_lshr_b32 s67, s18, 16 -; GFX9-NEXT: s_lshr_b32 s68, s18, 8 -; GFX9-NEXT: s_lshr_b32 s69, s17, 24 -; GFX9-NEXT: s_lshr_b32 s70, s17, 16 -; GFX9-NEXT: s_lshr_b32 s71, s17, 8 -; GFX9-NEXT: s_lshr_b32 s80, s16, 16 -; GFX9-NEXT: s_lshr_b32 s46, s16, 8 -; GFX9-NEXT: v_writelane_b32 v21, s57, 1 -; GFX9-NEXT: s_lshr_b64 s[56:57], s[6:7], 24 +; GFX9-NEXT: s_lshr_b32 s26, s5, 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 2 +; GFX9-NEXT: s_lshr_b32 s26, s5, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 3 +; GFX9-NEXT: s_lshr_b32 s26, s5, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 4 +; GFX9-NEXT: s_lshr_b32 s26, s4, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 5 +; GFX9-NEXT: s_lshr_b32 s26, s4, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 6 +; GFX9-NEXT: s_lshr_b32 s26, s7, 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 7 +; GFX9-NEXT: s_lshr_b32 s26, s7, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 8 +; GFX9-NEXT: s_lshr_b32 s26, s7, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 9 +; GFX9-NEXT: s_lshr_b32 s26, s6, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 10 +; GFX9-NEXT: s_lshr_b32 s26, s6, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 11 +; GFX9-NEXT: s_lshr_b32 s26, s9, 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 12 +; GFX9-NEXT: s_lshr_b32 s26, s9, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 13 +; GFX9-NEXT: s_lshr_b32 s26, s9, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 14 +; GFX9-NEXT: s_lshr_b32 s26, s8, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 15 +; GFX9-NEXT: s_lshr_b32 s26, s8, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 16 +; GFX9-NEXT: s_lshr_b32 s26, s11, 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 17 +; GFX9-NEXT: s_lshr_b32 s26, s11, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 18 +; GFX9-NEXT: s_lshr_b32 s26, s11, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 19 +; GFX9-NEXT: s_lshr_b32 s26, s10, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 20 +; GFX9-NEXT: s_lshr_b32 s26, s10, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 21 +; GFX9-NEXT: s_lshr_b32 s26, s13, 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 22 +; GFX9-NEXT: s_lshr_b32 s26, s13, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 23 +; GFX9-NEXT: s_lshr_b32 s26, s13, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 24 +; GFX9-NEXT: s_lshr_b32 s26, s12, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 25 +; GFX9-NEXT: s_lshr_b32 s26, s12, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 26 +; GFX9-NEXT: s_lshr_b32 s26, s15, 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 27 +; GFX9-NEXT: s_lshr_b32 s26, s15, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 28 +; GFX9-NEXT: s_lshr_b32 s26, s15, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 29 +; GFX9-NEXT: s_lshr_b32 s26, s14, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 30 +; GFX9-NEXT: s_lshr_b32 s26, s14, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 31 +; GFX9-NEXT: s_lshr_b32 s26, s17, 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 32 +; GFX9-NEXT: s_lshr_b32 s26, s17, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 33 +; GFX9-NEXT: s_lshr_b32 s26, s17, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 34 +; GFX9-NEXT: s_lshr_b32 s26, s16, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 35 +; GFX9-NEXT: s_lshr_b32 s26, s16, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 36 +; GFX9-NEXT: s_lshr_b32 s26, s19, 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 37 +; GFX9-NEXT: s_lshr_b32 s26, s19, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 38 +; GFX9-NEXT: s_lshr_b32 s26, s19, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 39 +; GFX9-NEXT: s_lshr_b32 s26, s18, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 40 +; GFX9-NEXT: s_lshr_b32 s26, s18, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 41 +; GFX9-NEXT: s_lshr_b32 s26, s21, 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 42 +; GFX9-NEXT: s_lshr_b32 s26, s21, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 43 +; GFX9-NEXT: s_lshr_b32 s26, s21, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 44 +; GFX9-NEXT: s_lshr_b32 s26, s20, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 45 +; GFX9-NEXT: s_lshr_b32 s26, s20, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 46 +; GFX9-NEXT: s_lshr_b32 s26, s23, 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 47 +; GFX9-NEXT: s_lshr_b32 s26, s23, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 48 +; GFX9-NEXT: s_lshr_b32 s26, s23, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 49 +; GFX9-NEXT: s_lshr_b32 s26, s22, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 50 +; GFX9-NEXT: s_lshr_b64 s[28:29], s[4:5], 24 +; GFX9-NEXT: v_writelane_b32 v22, s28, 0 +; GFX9-NEXT: s_lshr_b32 s82, s22, 8 +; GFX9-NEXT: s_lshr_b32 s83, s25, 24 +; GFX9-NEXT: s_lshr_b32 s81, s25, 16 +; GFX9-NEXT: s_lshr_b32 s84, s25, 8 +; GFX9-NEXT: s_lshr_b32 s85, s24, 16 +; GFX9-NEXT: s_lshr_b32 s86, s24, 8 +; GFX9-NEXT: s_lshr_b32 s87, s41, 24 +; GFX9-NEXT: s_lshr_b32 s96, s41, 16 +; GFX9-NEXT: s_lshr_b32 s97, s41, 8 +; GFX9-NEXT: s_lshr_b32 s98, s40, 16 +; GFX9-NEXT: s_lshr_b32 s99, s40, 8 +; GFX9-NEXT: s_lshr_b32 s38, s43, 24 +; GFX9-NEXT: s_lshr_b32 s39, s43, 16 +; GFX9-NEXT: s_lshr_b32 s48, s43, 8 +; GFX9-NEXT: s_lshr_b32 s49, s42, 16 +; GFX9-NEXT: s_lshr_b32 s50, s42, 8 +; GFX9-NEXT: s_lshr_b32 s51, s45, 24 +; GFX9-NEXT: s_lshr_b32 s52, s45, 16 +; GFX9-NEXT: s_lshr_b32 s53, s45, 8 +; GFX9-NEXT: s_lshr_b32 s54, s44, 16 +; GFX9-NEXT: s_lshr_b32 s55, s44, 8 +; GFX9-NEXT: s_lshr_b32 s64, s47, 24 +; GFX9-NEXT: s_lshr_b32 s65, s47, 16 +; GFX9-NEXT: s_lshr_b32 s66, s47, 8 +; GFX9-NEXT: s_lshr_b32 s67, s46, 16 +; GFX9-NEXT: s_lshr_b32 s68, s46, 8 +; GFX9-NEXT: s_lshr_b32 s69, s57, 24 +; GFX9-NEXT: s_lshr_b32 s70, s57, 16 +; GFX9-NEXT: s_lshr_b32 s71, s57, 8 +; GFX9-NEXT: s_lshr_b32 s80, s56, 16 +; GFX9-NEXT: s_lshr_b32 s26, s56, 8 +; GFX9-NEXT: v_writelane_b32 v22, s29, 1 +; GFX9-NEXT: s_lshr_b64 s[28:29], s[6:7], 24 ; GFX9-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 ; GFX9-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 ; GFX9-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 ; GFX9-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 -; GFX9-NEXT: s_lshr_b64 s[74:75], s[40:41], 24 -; GFX9-NEXT: s_lshr_b64 s[76:77], s[42:43], 24 -; GFX9-NEXT: s_lshr_b64 s[78:79], s[44:45], 24 -; GFX9-NEXT: s_lshr_b64 s[88:89], s[28:29], 24 -; GFX9-NEXT: s_lshr_b64 s[90:91], s[26:27], 24 -; GFX9-NEXT: s_lshr_b64 s[92:93], s[24:25], 24 -; GFX9-NEXT: s_lshr_b64 s[94:95], s[22:23], 24 -; GFX9-NEXT: s_lshr_b64 s[30:31], s[20:21], 24 -; GFX9-NEXT: s_lshr_b64 s[34:35], s[18:19], 24 -; GFX9-NEXT: s_lshr_b64 s[36:37], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[74:75], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[76:77], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[88:89], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[90:91], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[92:93], s[40:41], 24 +; GFX9-NEXT: s_lshr_b64 s[94:95], s[42:43], 24 +; GFX9-NEXT: s_lshr_b64 s[30:31], s[44:45], 24 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[46:47], 24 +; GFX9-NEXT: s_lshr_b64 s[36:37], s[56:57], 24 ; GFX9-NEXT: s_cbranch_execnz .LBB13_3 ; GFX9-NEXT: .LBB13_2: ; %cmp.true ; GFX9-NEXT: s_add_i32 s5, s5, 3 -; GFX9-NEXT: s_lshr_b32 s46, s5, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 2 -; GFX9-NEXT: s_lshr_b32 s46, s5, 16 +; GFX9-NEXT: s_lshr_b32 s26, s5, 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 2 +; GFX9-NEXT: s_lshr_b32 s26, s5, 16 ; GFX9-NEXT: s_add_i32 s4, s4, 3 -; GFX9-NEXT: v_writelane_b32 v21, s46, 3 -; GFX9-NEXT: s_lshr_b32 s46, s5, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 4 -; GFX9-NEXT: s_lshr_b32 s46, s4, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 3 +; GFX9-NEXT: s_lshr_b32 s26, s5, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 4 +; GFX9-NEXT: s_lshr_b32 s26, s4, 16 ; GFX9-NEXT: s_add_i32 s7, s7, 3 -; GFX9-NEXT: v_writelane_b32 v21, s46, 5 -; GFX9-NEXT: s_lshr_b32 s46, s4, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 6 -; GFX9-NEXT: s_lshr_b32 s46, s7, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 7 -; GFX9-NEXT: s_lshr_b32 s46, s7, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 5 +; GFX9-NEXT: s_lshr_b32 s26, s4, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 6 +; GFX9-NEXT: s_lshr_b32 s26, s7, 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 7 +; GFX9-NEXT: s_lshr_b32 s26, s7, 16 ; GFX9-NEXT: s_add_i32 s6, s6, 3 -; GFX9-NEXT: v_writelane_b32 v21, s46, 8 -; GFX9-NEXT: s_lshr_b32 s46, s7, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 9 -; GFX9-NEXT: s_lshr_b32 s46, s6, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 8 +; GFX9-NEXT: s_lshr_b32 s26, s7, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 9 +; GFX9-NEXT: s_lshr_b32 s26, s6, 16 ; GFX9-NEXT: s_add_i32 s9, s9, 3 -; GFX9-NEXT: v_writelane_b32 v21, s46, 10 -; GFX9-NEXT: s_lshr_b32 s46, s6, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 11 -; GFX9-NEXT: s_lshr_b32 s46, s9, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 12 -; GFX9-NEXT: s_lshr_b32 s46, s9, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 10 +; GFX9-NEXT: s_lshr_b32 s26, s6, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 11 +; GFX9-NEXT: s_lshr_b32 s26, s9, 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 12 +; GFX9-NEXT: s_lshr_b32 s26, s9, 16 ; GFX9-NEXT: s_add_i32 s8, s8, 3 -; GFX9-NEXT: v_writelane_b32 v21, s46, 13 -; GFX9-NEXT: s_lshr_b32 s46, s9, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 14 -; GFX9-NEXT: s_lshr_b32 s46, s8, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 13 +; GFX9-NEXT: s_lshr_b32 s26, s9, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 14 +; GFX9-NEXT: s_lshr_b32 s26, s8, 16 ; GFX9-NEXT: s_add_i32 s11, s11, 3 -; GFX9-NEXT: v_writelane_b32 v21, s46, 15 -; GFX9-NEXT: s_lshr_b32 s46, s8, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 16 -; GFX9-NEXT: s_lshr_b32 s46, s11, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 17 -; GFX9-NEXT: s_lshr_b32 s46, s11, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 15 +; GFX9-NEXT: s_lshr_b32 s26, s8, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 16 +; GFX9-NEXT: s_lshr_b32 s26, s11, 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 17 +; GFX9-NEXT: s_lshr_b32 s26, s11, 16 ; GFX9-NEXT: s_add_i32 s10, s10, 3 -; GFX9-NEXT: v_writelane_b32 v21, s46, 18 -; GFX9-NEXT: s_lshr_b32 s46, s11, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 19 -; GFX9-NEXT: s_lshr_b32 s46, s10, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 18 +; GFX9-NEXT: s_lshr_b32 s26, s11, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 19 +; GFX9-NEXT: s_lshr_b32 s26, s10, 16 ; GFX9-NEXT: s_add_i32 s13, s13, 3 -; GFX9-NEXT: v_writelane_b32 v21, s46, 20 -; GFX9-NEXT: s_lshr_b32 s46, s10, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 21 -; GFX9-NEXT: s_lshr_b32 s46, s13, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 22 -; GFX9-NEXT: s_lshr_b32 s46, s13, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 20 +; GFX9-NEXT: s_lshr_b32 s26, s10, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 21 +; GFX9-NEXT: s_lshr_b32 s26, s13, 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 22 +; GFX9-NEXT: s_lshr_b32 s26, s13, 16 ; GFX9-NEXT: s_add_i32 s12, s12, 3 -; GFX9-NEXT: v_writelane_b32 v21, s46, 23 -; GFX9-NEXT: s_lshr_b32 s46, s13, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 24 -; GFX9-NEXT: s_lshr_b32 s46, s12, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 23 +; GFX9-NEXT: s_lshr_b32 s26, s13, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 24 +; GFX9-NEXT: s_lshr_b32 s26, s12, 16 ; GFX9-NEXT: s_add_i32 s15, s15, 3 -; GFX9-NEXT: v_writelane_b32 v21, s46, 25 -; GFX9-NEXT: s_lshr_b32 s46, s12, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 26 -; GFX9-NEXT: s_lshr_b32 s46, s15, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 27 -; GFX9-NEXT: s_lshr_b32 s46, s15, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 25 +; GFX9-NEXT: s_lshr_b32 s26, s12, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 26 +; GFX9-NEXT: s_lshr_b32 s26, s15, 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 27 +; GFX9-NEXT: s_lshr_b32 s26, s15, 16 ; GFX9-NEXT: s_add_i32 s14, s14, 3 -; GFX9-NEXT: v_writelane_b32 v21, s46, 28 -; GFX9-NEXT: s_lshr_b32 s46, s15, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 29 -; GFX9-NEXT: s_lshr_b32 s46, s14, 16 -; GFX9-NEXT: s_add_i32 s41, s41, 3 -; GFX9-NEXT: v_writelane_b32 v21, s46, 30 -; GFX9-NEXT: s_lshr_b32 s46, s14, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 31 -; GFX9-NEXT: s_lshr_b32 s46, s41, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 32 -; GFX9-NEXT: s_lshr_b32 s46, s41, 16 -; GFX9-NEXT: s_add_i32 s40, s40, 3 -; GFX9-NEXT: v_writelane_b32 v21, s46, 33 -; GFX9-NEXT: s_lshr_b32 s46, s41, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 34 -; GFX9-NEXT: s_lshr_b32 s46, s40, 16 -; GFX9-NEXT: s_add_i32 s43, s43, 3 -; GFX9-NEXT: v_writelane_b32 v21, s46, 35 -; GFX9-NEXT: s_lshr_b32 s46, s40, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 36 -; GFX9-NEXT: s_lshr_b32 s46, s43, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 37 -; GFX9-NEXT: s_lshr_b32 s46, s43, 16 -; GFX9-NEXT: s_add_i32 s42, s42, 3 -; GFX9-NEXT: v_writelane_b32 v21, s46, 38 -; GFX9-NEXT: s_lshr_b32 s46, s43, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 39 -; GFX9-NEXT: s_lshr_b32 s46, s42, 16 -; GFX9-NEXT: s_add_i32 s45, s45, 3 -; GFX9-NEXT: v_writelane_b32 v21, s46, 40 -; GFX9-NEXT: s_lshr_b32 s46, s42, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 41 -; GFX9-NEXT: s_lshr_b32 s46, s45, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 42 -; GFX9-NEXT: s_lshr_b32 s46, s45, 16 -; GFX9-NEXT: s_add_i32 s44, s44, 3 -; GFX9-NEXT: v_writelane_b32 v21, s46, 43 -; GFX9-NEXT: s_lshr_b32 s46, s45, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 44 -; GFX9-NEXT: s_lshr_b32 s46, s44, 16 -; GFX9-NEXT: s_add_i32 s29, s29, 3 -; GFX9-NEXT: v_writelane_b32 v21, s46, 45 -; GFX9-NEXT: s_lshr_b32 s46, s44, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 46 -; GFX9-NEXT: s_lshr_b32 s46, s29, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 47 -; GFX9-NEXT: s_lshr_b32 s46, s29, 16 -; GFX9-NEXT: s_add_i32 s28, s28, 3 -; GFX9-NEXT: v_writelane_b32 v21, s46, 48 -; GFX9-NEXT: s_lshr_b32 s46, s29, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 49 -; GFX9-NEXT: s_lshr_b32 s46, s28, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 50 -; GFX9-NEXT: s_lshr_b64 s[56:57], s[4:5], 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 28 +; GFX9-NEXT: s_lshr_b32 s26, s15, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 29 +; GFX9-NEXT: s_lshr_b32 s26, s14, 16 ; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: v_writelane_b32 v22, s26, 30 +; GFX9-NEXT: s_lshr_b32 s26, s14, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 31 +; GFX9-NEXT: s_lshr_b32 s26, s17, 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 32 +; GFX9-NEXT: s_lshr_b32 s26, s17, 16 ; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: v_writelane_b32 v22, s26, 33 +; GFX9-NEXT: s_lshr_b32 s26, s17, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 34 +; GFX9-NEXT: s_lshr_b32 s26, s16, 16 ; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: v_writelane_b32 v22, s26, 35 +; GFX9-NEXT: s_lshr_b32 s26, s16, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 36 +; GFX9-NEXT: s_lshr_b32 s26, s19, 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 37 +; GFX9-NEXT: s_lshr_b32 s26, s19, 16 ; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: v_writelane_b32 v22, s26, 38 +; GFX9-NEXT: s_lshr_b32 s26, s19, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 39 +; GFX9-NEXT: s_lshr_b32 s26, s18, 16 ; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: v_writelane_b32 v22, s26, 40 +; GFX9-NEXT: s_lshr_b32 s26, s18, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 41 +; GFX9-NEXT: s_lshr_b32 s26, s21, 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 42 +; GFX9-NEXT: s_lshr_b32 s26, s21, 16 ; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: v_writelane_b32 v22, s26, 43 +; GFX9-NEXT: s_lshr_b32 s26, s21, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 44 +; GFX9-NEXT: s_lshr_b32 s26, s20, 16 ; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: v_writelane_b32 v22, s26, 45 +; GFX9-NEXT: s_lshr_b32 s26, s20, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 46 +; GFX9-NEXT: s_lshr_b32 s26, s23, 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 47 +; GFX9-NEXT: s_lshr_b32 s26, s23, 16 ; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: v_writelane_b32 v22, s26, 48 +; GFX9-NEXT: s_lshr_b32 s26, s23, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 49 +; GFX9-NEXT: s_lshr_b32 s26, s22, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 50 +; GFX9-NEXT: s_lshr_b64 s[28:29], s[4:5], 24 +; GFX9-NEXT: s_add_i32 s57, s57, 3 +; GFX9-NEXT: s_add_i32 s56, s56, 3 +; GFX9-NEXT: s_add_i32 s47, s47, 3 +; GFX9-NEXT: s_add_i32 s46, s46, 3 +; GFX9-NEXT: s_add_i32 s45, s45, 3 +; GFX9-NEXT: s_add_i32 s44, s44, 3 +; GFX9-NEXT: s_add_i32 s43, s43, 3 +; GFX9-NEXT: s_add_i32 s42, s42, 3 +; GFX9-NEXT: s_add_i32 s41, s41, 3 +; GFX9-NEXT: s_add_i32 s40, s40, 3 ; GFX9-NEXT: s_add_i32 s25, s25, 3 ; GFX9-NEXT: s_add_i32 s24, s24, 3 -; GFX9-NEXT: s_add_i32 s27, s27, 3 -; GFX9-NEXT: s_add_i32 s26, s26, 3 -; GFX9-NEXT: v_writelane_b32 v21, s56, 0 -; GFX9-NEXT: s_lshr_b32 s82, s28, 8 -; GFX9-NEXT: s_lshr_b32 s83, s27, 24 -; GFX9-NEXT: s_lshr_b32 s81, s27, 16 -; GFX9-NEXT: s_lshr_b32 s84, s27, 8 -; GFX9-NEXT: s_lshr_b32 s85, s26, 16 -; GFX9-NEXT: s_lshr_b32 s86, s26, 8 -; GFX9-NEXT: s_lshr_b32 s87, s25, 24 -; GFX9-NEXT: s_lshr_b32 s96, s25, 16 -; GFX9-NEXT: s_lshr_b32 s97, s25, 8 -; GFX9-NEXT: s_lshr_b32 s98, s24, 16 -; GFX9-NEXT: s_lshr_b32 s99, s24, 8 -; GFX9-NEXT: s_lshr_b32 s38, s23, 24 -; GFX9-NEXT: s_lshr_b32 s39, s23, 16 -; GFX9-NEXT: s_lshr_b32 s48, s23, 8 -; GFX9-NEXT: s_lshr_b32 s49, s22, 16 -; GFX9-NEXT: s_lshr_b32 s50, s22, 8 -; GFX9-NEXT: s_lshr_b32 s51, s21, 24 -; GFX9-NEXT: s_lshr_b32 s52, s21, 16 -; GFX9-NEXT: s_lshr_b32 s53, s21, 8 -; GFX9-NEXT: s_lshr_b32 s54, s20, 16 -; GFX9-NEXT: s_lshr_b32 s55, s20, 8 -; GFX9-NEXT: s_lshr_b32 s64, s19, 24 -; GFX9-NEXT: s_lshr_b32 s65, s19, 16 -; GFX9-NEXT: s_lshr_b32 s66, s19, 8 -; GFX9-NEXT: s_lshr_b32 s67, s18, 16 -; GFX9-NEXT: s_lshr_b32 s68, s18, 8 -; GFX9-NEXT: s_lshr_b32 s69, s17, 24 -; GFX9-NEXT: s_lshr_b32 s70, s17, 16 -; GFX9-NEXT: s_lshr_b32 s71, s17, 8 -; GFX9-NEXT: s_lshr_b32 s80, s16, 16 -; GFX9-NEXT: s_lshr_b32 s46, s16, 8 -; GFX9-NEXT: v_writelane_b32 v21, s57, 1 -; GFX9-NEXT: s_lshr_b64 s[56:57], s[6:7], 24 +; GFX9-NEXT: v_writelane_b32 v22, s28, 0 +; GFX9-NEXT: s_lshr_b32 s82, s22, 8 +; GFX9-NEXT: s_lshr_b32 s83, s25, 24 +; GFX9-NEXT: s_lshr_b32 s81, s25, 16 +; GFX9-NEXT: s_lshr_b32 s84, s25, 8 +; GFX9-NEXT: s_lshr_b32 s85, s24, 16 +; GFX9-NEXT: s_lshr_b32 s86, s24, 8 +; GFX9-NEXT: s_lshr_b32 s87, s41, 24 +; GFX9-NEXT: s_lshr_b32 s96, s41, 16 +; GFX9-NEXT: s_lshr_b32 s97, s41, 8 +; GFX9-NEXT: s_lshr_b32 s98, s40, 16 +; GFX9-NEXT: s_lshr_b32 s99, s40, 8 +; GFX9-NEXT: s_lshr_b32 s38, s43, 24 +; GFX9-NEXT: s_lshr_b32 s39, s43, 16 +; GFX9-NEXT: s_lshr_b32 s48, s43, 8 +; GFX9-NEXT: s_lshr_b32 s49, s42, 16 +; GFX9-NEXT: s_lshr_b32 s50, s42, 8 +; GFX9-NEXT: s_lshr_b32 s51, s45, 24 +; GFX9-NEXT: s_lshr_b32 s52, s45, 16 +; GFX9-NEXT: s_lshr_b32 s53, s45, 8 +; GFX9-NEXT: s_lshr_b32 s54, s44, 16 +; GFX9-NEXT: s_lshr_b32 s55, s44, 8 +; GFX9-NEXT: s_lshr_b32 s64, s47, 24 +; GFX9-NEXT: s_lshr_b32 s65, s47, 16 +; GFX9-NEXT: s_lshr_b32 s66, s47, 8 +; GFX9-NEXT: s_lshr_b32 s67, s46, 16 +; GFX9-NEXT: s_lshr_b32 s68, s46, 8 +; GFX9-NEXT: s_lshr_b32 s69, s57, 24 +; GFX9-NEXT: s_lshr_b32 s70, s57, 16 +; GFX9-NEXT: s_lshr_b32 s71, s57, 8 +; GFX9-NEXT: s_lshr_b32 s80, s56, 16 +; GFX9-NEXT: s_lshr_b32 s26, s56, 8 +; GFX9-NEXT: v_writelane_b32 v22, s29, 1 +; GFX9-NEXT: s_lshr_b64 s[28:29], s[6:7], 24 ; GFX9-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 ; GFX9-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 ; GFX9-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 ; GFX9-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 -; GFX9-NEXT: s_lshr_b64 s[74:75], s[40:41], 24 -; GFX9-NEXT: s_lshr_b64 s[76:77], s[42:43], 24 -; GFX9-NEXT: s_lshr_b64 s[78:79], s[44:45], 24 -; GFX9-NEXT: s_lshr_b64 s[88:89], s[28:29], 24 -; GFX9-NEXT: s_lshr_b64 s[90:91], s[26:27], 24 -; GFX9-NEXT: s_lshr_b64 s[92:93], s[24:25], 24 -; GFX9-NEXT: s_lshr_b64 s[94:95], s[22:23], 24 -; GFX9-NEXT: s_lshr_b64 s[30:31], s[20:21], 24 -; GFX9-NEXT: s_lshr_b64 s[34:35], s[18:19], 24 -; GFX9-NEXT: s_lshr_b64 s[36:37], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[74:75], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[76:77], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[88:89], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[90:91], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[92:93], s[40:41], 24 +; GFX9-NEXT: s_lshr_b64 s[94:95], s[42:43], 24 +; GFX9-NEXT: s_lshr_b64 s[30:31], s[44:45], 24 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[46:47], 24 +; GFX9-NEXT: s_lshr_b64 s[36:37], s[56:57], 24 ; GFX9-NEXT: .LBB13_3: ; %end -; GFX9-NEXT: s_lshl_b32 s46, s46, 8 +; GFX9-NEXT: s_lshl_b32 s26, s26, 8 +; GFX9-NEXT: s_and_b32 s27, s56, 0xff +; GFX9-NEXT: s_or_b32 s26, s27, s26 +; GFX9-NEXT: s_lshl_b32 s27, s36, 8 +; GFX9-NEXT: s_and_b32 s29, s80, 0xff +; GFX9-NEXT: s_or_b32 s27, s29, s27 +; GFX9-NEXT: s_and_b32 s26, s26, 0xffff +; GFX9-NEXT: s_lshl_b32 s27, s27, 16 +; GFX9-NEXT: s_or_b32 s26, s26, s27 +; GFX9-NEXT: v_mov_b32_e32 v1, s26 +; GFX9-NEXT: s_and_b32 s26, s57, 0xff +; GFX9-NEXT: s_lshl_b32 s27, s71, 8 +; GFX9-NEXT: s_or_b32 s26, s26, s27 +; GFX9-NEXT: s_and_b32 s27, s70, 0xff +; GFX9-NEXT: s_lshl_b32 s29, s69, 8 +; GFX9-NEXT: s_or_b32 s27, s27, s29 +; GFX9-NEXT: s_and_b32 s26, s26, 0xffff +; GFX9-NEXT: s_lshl_b32 s27, s27, 16 +; GFX9-NEXT: s_or_b32 s26, s26, s27 +; GFX9-NEXT: v_mov_b32_e32 v2, s26 +; GFX9-NEXT: s_lshl_b32 s26, s68, 8 +; GFX9-NEXT: s_and_b32 s27, s46, 0xff +; GFX9-NEXT: s_or_b32 s26, s27, s26 +; GFX9-NEXT: s_lshl_b32 s27, s34, 8 +; GFX9-NEXT: s_and_b32 s29, s67, 0xff +; GFX9-NEXT: s_or_b32 s27, s29, s27 +; GFX9-NEXT: s_and_b32 s26, s26, 0xffff +; GFX9-NEXT: s_lshl_b32 s27, s27, 16 +; GFX9-NEXT: s_or_b32 s26, s26, s27 +; GFX9-NEXT: v_mov_b32_e32 v3, s26 +; GFX9-NEXT: s_and_b32 s26, s47, 0xff +; GFX9-NEXT: s_lshl_b32 s27, s66, 8 +; GFX9-NEXT: s_or_b32 s26, s26, s27 +; GFX9-NEXT: s_and_b32 s27, s65, 0xff +; GFX9-NEXT: s_lshl_b32 s29, s64, 8 +; GFX9-NEXT: s_or_b32 s27, s27, s29 +; GFX9-NEXT: s_and_b32 s26, s26, 0xffff +; GFX9-NEXT: s_lshl_b32 s27, s27, 16 +; GFX9-NEXT: s_or_b32 s26, s26, s27 +; GFX9-NEXT: v_mov_b32_e32 v4, s26 +; GFX9-NEXT: s_lshl_b32 s26, s55, 8 +; GFX9-NEXT: s_and_b32 s27, s44, 0xff +; GFX9-NEXT: s_or_b32 s26, s27, s26 +; GFX9-NEXT: s_lshl_b32 s27, s30, 8 +; GFX9-NEXT: s_and_b32 s29, s54, 0xff +; GFX9-NEXT: s_or_b32 s27, s29, s27 +; GFX9-NEXT: s_and_b32 s26, s26, 0xffff +; GFX9-NEXT: s_lshl_b32 s27, s27, 16 +; GFX9-NEXT: s_or_b32 s26, s26, s27 +; GFX9-NEXT: v_mov_b32_e32 v5, s26 +; GFX9-NEXT: s_and_b32 s26, s45, 0xff +; GFX9-NEXT: s_lshl_b32 s27, s53, 8 +; GFX9-NEXT: s_or_b32 s26, s26, s27 +; GFX9-NEXT: s_and_b32 s27, s52, 0xff +; GFX9-NEXT: s_lshl_b32 s29, s51, 8 +; GFX9-NEXT: s_or_b32 s27, s27, s29 +; GFX9-NEXT: s_and_b32 s26, s26, 0xffff +; GFX9-NEXT: s_lshl_b32 s27, s27, 16 +; GFX9-NEXT: s_or_b32 s26, s26, s27 +; GFX9-NEXT: v_mov_b32_e32 v6, s26 +; GFX9-NEXT: s_lshl_b32 s26, s50, 8 +; GFX9-NEXT: s_and_b32 s27, s42, 0xff +; GFX9-NEXT: s_or_b32 s26, s27, s26 +; GFX9-NEXT: s_lshl_b32 s27, s94, 8 +; GFX9-NEXT: s_and_b32 s29, s49, 0xff +; GFX9-NEXT: s_or_b32 s27, s29, s27 +; GFX9-NEXT: s_and_b32 s26, s26, 0xffff +; GFX9-NEXT: s_lshl_b32 s27, s27, 16 +; GFX9-NEXT: s_or_b32 s26, s26, s27 +; GFX9-NEXT: v_mov_b32_e32 v7, s26 +; GFX9-NEXT: s_and_b32 s26, s43, 0xff +; GFX9-NEXT: s_lshl_b32 s27, s48, 8 +; GFX9-NEXT: s_or_b32 s26, s26, s27 +; GFX9-NEXT: s_and_b32 s27, s39, 0xff +; GFX9-NEXT: s_lshl_b32 s29, s38, 8 +; GFX9-NEXT: s_or_b32 s27, s27, s29 +; GFX9-NEXT: s_and_b32 s26, s26, 0xffff +; GFX9-NEXT: s_lshl_b32 s27, s27, 16 +; GFX9-NEXT: s_or_b32 s26, s26, s27 +; GFX9-NEXT: v_mov_b32_e32 v8, s26 +; GFX9-NEXT: s_lshl_b32 s26, s99, 8 +; GFX9-NEXT: s_and_b32 s27, s40, 0xff +; GFX9-NEXT: s_or_b32 s26, s27, s26 +; GFX9-NEXT: s_lshl_b32 s27, s92, 8 +; GFX9-NEXT: s_and_b32 s29, s98, 0xff +; GFX9-NEXT: s_or_b32 s27, s29, s27 +; GFX9-NEXT: s_and_b32 s26, s26, 0xffff +; GFX9-NEXT: s_lshl_b32 s27, s27, 16 +; GFX9-NEXT: s_or_b32 s26, s26, s27 +; GFX9-NEXT: v_mov_b32_e32 v9, s26 +; GFX9-NEXT: s_and_b32 s26, s41, 0xff +; GFX9-NEXT: s_lshl_b32 s27, s97, 8 +; GFX9-NEXT: s_or_b32 s26, s26, s27 +; GFX9-NEXT: s_and_b32 s27, s96, 0xff +; GFX9-NEXT: s_lshl_b32 s29, s87, 8 +; GFX9-NEXT: s_or_b32 s27, s27, s29 +; GFX9-NEXT: s_and_b32 s26, s26, 0xffff +; GFX9-NEXT: s_lshl_b32 s27, s27, 16 +; GFX9-NEXT: s_or_b32 s26, s26, s27 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: s_lshl_b32 s26, s86, 8 +; GFX9-NEXT: s_and_b32 s24, s24, 0xff +; GFX9-NEXT: s_or_b32 s24, s24, s26 +; GFX9-NEXT: s_lshl_b32 s26, s90, 8 +; GFX9-NEXT: s_and_b32 s27, s85, 0xff +; GFX9-NEXT: s_or_b32 s26, s27, s26 +; GFX9-NEXT: s_and_b32 s24, s24, 0xffff +; GFX9-NEXT: s_lshl_b32 s26, s26, 16 +; GFX9-NEXT: s_or_b32 s24, s24, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s24 +; GFX9-NEXT: s_and_b32 s24, s25, 0xff +; GFX9-NEXT: s_lshl_b32 s25, s84, 8 +; GFX9-NEXT: s_or_b32 s24, s24, s25 +; GFX9-NEXT: s_and_b32 s25, s81, 0xff +; GFX9-NEXT: s_lshl_b32 s26, s83, 8 +; GFX9-NEXT: s_or_b32 s25, s25, s26 +; GFX9-NEXT: s_and_b32 s24, s24, 0xffff +; GFX9-NEXT: s_lshl_b32 s25, s25, 16 +; GFX9-NEXT: s_or_b32 s24, s24, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s24 +; GFX9-NEXT: s_lshl_b32 s24, s82, 8 +; GFX9-NEXT: s_and_b32 s22, s22, 0xff +; GFX9-NEXT: v_readlane_b32 s25, v22, 50 +; GFX9-NEXT: s_or_b32 s22, s22, s24 +; GFX9-NEXT: s_lshl_b32 s24, s88, 8 +; GFX9-NEXT: s_and_b32 s25, s25, 0xff +; GFX9-NEXT: s_or_b32 s24, s25, s24 +; GFX9-NEXT: s_and_b32 s22, s22, 0xffff +; GFX9-NEXT: s_lshl_b32 s24, s24, 16 +; GFX9-NEXT: s_or_b32 s22, s22, s24 +; GFX9-NEXT: v_mov_b32_e32 v13, s22 +; GFX9-NEXT: s_and_b32 s22, s23, 0xff +; GFX9-NEXT: v_readlane_b32 s23, v22, 49 +; GFX9-NEXT: s_lshl_b32 s23, s23, 8 +; GFX9-NEXT: s_or_b32 s22, s22, s23 +; GFX9-NEXT: v_readlane_b32 s23, v22, 48 +; GFX9-NEXT: v_readlane_b32 s24, v22, 47 +; GFX9-NEXT: s_and_b32 s23, s23, 0xff +; GFX9-NEXT: s_lshl_b32 s24, s24, 8 +; GFX9-NEXT: s_or_b32 s23, s23, s24 +; GFX9-NEXT: s_and_b32 s22, s22, 0xffff +; GFX9-NEXT: s_lshl_b32 s23, s23, 16 +; GFX9-NEXT: s_or_b32 s22, s22, s23 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: v_mov_b32_e32 v1, s22 +; GFX9-NEXT: v_readlane_b32 s22, v22, 46 +; GFX9-NEXT: s_and_b32 s20, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s22, s22, 8 +; GFX9-NEXT: s_or_b32 s20, s20, s22 +; GFX9-NEXT: v_readlane_b32 s22, v22, 45 +; GFX9-NEXT: s_and_b32 s22, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s23, s78, 8 +; GFX9-NEXT: s_or_b32 s22, s22, s23 +; GFX9-NEXT: s_and_b32 s20, s20, 0xffff +; GFX9-NEXT: s_lshl_b32 s22, s22, 16 +; GFX9-NEXT: s_or_b32 s20, s20, s22 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: v_mov_b32_e32 v1, s20 +; GFX9-NEXT: s_and_b32 s20, s21, 0xff +; GFX9-NEXT: v_readlane_b32 s21, v22, 44 +; GFX9-NEXT: s_lshl_b32 s21, s21, 8 +; GFX9-NEXT: s_or_b32 s20, s20, s21 +; GFX9-NEXT: v_readlane_b32 s21, v22, 43 +; GFX9-NEXT: v_readlane_b32 s22, v22, 42 +; GFX9-NEXT: s_and_b32 s21, s21, 0xff +; GFX9-NEXT: s_lshl_b32 s22, s22, 8 +; GFX9-NEXT: s_or_b32 s21, s21, s22 +; GFX9-NEXT: s_and_b32 s20, s20, 0xffff +; GFX9-NEXT: s_lshl_b32 s21, s21, 16 +; GFX9-NEXT: s_or_b32 s20, s20, s21 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: v_mov_b32_e32 v1, s20 +; GFX9-NEXT: v_readlane_b32 s20, v22, 41 +; GFX9-NEXT: s_and_b32 s18, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s20, s20, 8 +; GFX9-NEXT: s_or_b32 s18, s18, s20 +; GFX9-NEXT: v_readlane_b32 s20, v22, 40 +; GFX9-NEXT: s_and_b32 s20, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s21, s76, 8 +; GFX9-NEXT: s_or_b32 s20, s20, s21 +; GFX9-NEXT: s_and_b32 s18, s18, 0xffff +; GFX9-NEXT: s_lshl_b32 s20, s20, 16 +; GFX9-NEXT: s_or_b32 s18, s18, s20 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: v_mov_b32_e32 v1, s18 +; GFX9-NEXT: s_and_b32 s18, s19, 0xff +; GFX9-NEXT: v_readlane_b32 s19, v22, 39 +; GFX9-NEXT: s_lshl_b32 s19, s19, 8 +; GFX9-NEXT: s_or_b32 s18, s18, s19 +; GFX9-NEXT: v_readlane_b32 s19, v22, 38 +; GFX9-NEXT: v_readlane_b32 s20, v22, 37 +; GFX9-NEXT: s_and_b32 s19, s19, 0xff +; GFX9-NEXT: s_lshl_b32 s20, s20, 8 +; GFX9-NEXT: s_or_b32 s19, s19, s20 +; GFX9-NEXT: s_and_b32 s18, s18, 0xffff +; GFX9-NEXT: s_lshl_b32 s19, s19, 16 +; GFX9-NEXT: s_or_b32 s18, s18, s19 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64 +; GFX9-NEXT: v_mov_b32_e32 v1, s18 +; GFX9-NEXT: v_readlane_b32 s18, v22, 36 ; GFX9-NEXT: s_and_b32 s16, s16, 0xff -; GFX9-NEXT: s_or_b32 s16, s16, s46 -; GFX9-NEXT: s_lshl_b32 s46, s36, 8 -; GFX9-NEXT: s_and_b32 s47, s80, 0xff -; GFX9-NEXT: s_or_b32 s46, s47, s46 +; GFX9-NEXT: s_lshl_b32 s18, s18, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s18 +; GFX9-NEXT: v_readlane_b32 s18, v22, 35 +; GFX9-NEXT: s_and_b32 s18, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s19, s74, 8 +; GFX9-NEXT: s_or_b32 s18, s18, s19 ; GFX9-NEXT: s_and_b32 s16, s16, 0xffff -; GFX9-NEXT: s_lshl_b32 s46, s46, 16 -; GFX9-NEXT: s_or_b32 s16, s16, s46 -; GFX9-NEXT: v_mov_b32_e32 v1, s16 -; GFX9-NEXT: s_and_b32 s16, s17, 0xff -; GFX9-NEXT: s_lshl_b32 s17, s71, 8 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: s_and_b32 s17, s70, 0xff -; GFX9-NEXT: s_lshl_b32 s46, s69, 8 -; GFX9-NEXT: s_or_b32 s17, s17, s46 -; GFX9-NEXT: s_and_b32 s16, s16, 0xffff -; GFX9-NEXT: s_lshl_b32 s17, s17, 16 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: s_lshl_b32 s16, s68, 8 -; GFX9-NEXT: s_and_b32 s17, s18, 0xff -; GFX9-NEXT: s_or_b32 s16, s17, s16 -; GFX9-NEXT: s_lshl_b32 s17, s34, 8 -; GFX9-NEXT: s_and_b32 s18, s67, 0xff -; GFX9-NEXT: s_or_b32 s17, s18, s17 -; GFX9-NEXT: s_and_b32 s16, s16, 0xffff -; GFX9-NEXT: s_lshl_b32 s17, s17, 16 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_mov_b32_e32 v3, s16 -; GFX9-NEXT: s_and_b32 s16, s19, 0xff -; GFX9-NEXT: s_lshl_b32 s17, s66, 8 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: s_and_b32 s17, s65, 0xff -; GFX9-NEXT: s_lshl_b32 s18, s64, 8 -; GFX9-NEXT: s_or_b32 s17, s17, s18 -; GFX9-NEXT: s_and_b32 s16, s16, 0xffff -; GFX9-NEXT: s_lshl_b32 s17, s17, 16 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-NEXT: s_lshl_b32 s16, s55, 8 -; GFX9-NEXT: s_and_b32 s17, s20, 0xff -; GFX9-NEXT: s_or_b32 s16, s17, s16 -; GFX9-NEXT: s_lshl_b32 s17, s30, 8 -; GFX9-NEXT: s_and_b32 s18, s54, 0xff -; GFX9-NEXT: s_or_b32 s17, s18, s17 -; GFX9-NEXT: s_and_b32 s16, s16, 0xffff -; GFX9-NEXT: s_lshl_b32 s17, s17, 16 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_mov_b32_e32 v5, s16 -; GFX9-NEXT: s_and_b32 s16, s21, 0xff -; GFX9-NEXT: s_lshl_b32 s17, s53, 8 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: s_and_b32 s17, s52, 0xff -; GFX9-NEXT: s_lshl_b32 s18, s51, 8 -; GFX9-NEXT: s_or_b32 s17, s17, s18 -; GFX9-NEXT: s_and_b32 s16, s16, 0xffff -; GFX9-NEXT: s_lshl_b32 s17, s17, 16 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_mov_b32_e32 v6, s16 -; GFX9-NEXT: s_lshl_b32 s16, s50, 8 -; GFX9-NEXT: s_and_b32 s17, s22, 0xff -; GFX9-NEXT: s_or_b32 s16, s17, s16 -; GFX9-NEXT: s_lshl_b32 s17, s94, 8 -; GFX9-NEXT: s_and_b32 s18, s49, 0xff -; GFX9-NEXT: s_or_b32 s17, s18, s17 -; GFX9-NEXT: s_and_b32 s16, s16, 0xffff -; GFX9-NEXT: s_lshl_b32 s17, s17, 16 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_mov_b32_e32 v7, s16 -; GFX9-NEXT: s_and_b32 s16, s23, 0xff -; GFX9-NEXT: s_lshl_b32 s17, s48, 8 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: s_and_b32 s17, s39, 0xff -; GFX9-NEXT: s_lshl_b32 s18, s38, 8 -; GFX9-NEXT: s_or_b32 s17, s17, s18 -; GFX9-NEXT: s_and_b32 s16, s16, 0xffff -; GFX9-NEXT: s_lshl_b32 s17, s17, 16 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_mov_b32_e32 v8, s16 -; GFX9-NEXT: s_lshl_b32 s16, s99, 8 -; GFX9-NEXT: s_and_b32 s17, s24, 0xff -; GFX9-NEXT: s_or_b32 s16, s17, s16 -; GFX9-NEXT: s_lshl_b32 s17, s92, 8 -; GFX9-NEXT: s_and_b32 s18, s98, 0xff -; GFX9-NEXT: s_or_b32 s17, s18, s17 -; GFX9-NEXT: s_and_b32 s16, s16, 0xffff -; GFX9-NEXT: s_lshl_b32 s17, s17, 16 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_mov_b32_e32 v9, s16 -; GFX9-NEXT: s_and_b32 s16, s25, 0xff -; GFX9-NEXT: s_lshl_b32 s17, s97, 8 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: s_and_b32 s17, s96, 0xff -; GFX9-NEXT: s_lshl_b32 s18, s87, 8 -; GFX9-NEXT: s_or_b32 s17, s17, s18 -; GFX9-NEXT: s_and_b32 s16, s16, 0xffff -; GFX9-NEXT: s_lshl_b32 s17, s17, 16 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_mov_b32_e32 v10, s16 -; GFX9-NEXT: s_lshl_b32 s16, s86, 8 -; GFX9-NEXT: s_and_b32 s17, s26, 0xff -; GFX9-NEXT: s_or_b32 s16, s17, s16 -; GFX9-NEXT: s_lshl_b32 s17, s90, 8 -; GFX9-NEXT: s_and_b32 s18, s85, 0xff -; GFX9-NEXT: s_or_b32 s17, s18, s17 -; GFX9-NEXT: s_and_b32 s16, s16, 0xffff -; GFX9-NEXT: s_lshl_b32 s17, s17, 16 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_mov_b32_e32 v11, s16 -; GFX9-NEXT: s_and_b32 s16, s27, 0xff -; GFX9-NEXT: s_lshl_b32 s17, s84, 8 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: s_and_b32 s17, s81, 0xff -; GFX9-NEXT: s_lshl_b32 s18, s83, 8 -; GFX9-NEXT: s_or_b32 s17, s17, s18 -; GFX9-NEXT: s_and_b32 s16, s16, 0xffff -; GFX9-NEXT: s_lshl_b32 s17, s17, 16 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_mov_b32_e32 v12, s16 -; GFX9-NEXT: s_lshl_b32 s16, s82, 8 -; GFX9-NEXT: s_and_b32 s17, s28, 0xff -; GFX9-NEXT: v_readlane_b32 s18, v21, 50 -; GFX9-NEXT: s_or_b32 s16, s17, s16 -; GFX9-NEXT: s_lshl_b32 s17, s88, 8 -; GFX9-NEXT: s_and_b32 s18, s18, 0xff -; GFX9-NEXT: s_or_b32 s17, s18, s17 -; GFX9-NEXT: s_and_b32 s16, s16, 0xffff -; GFX9-NEXT: s_lshl_b32 s17, s17, 16 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 49 -; GFX9-NEXT: v_mov_b32_e32 v13, s16 -; GFX9-NEXT: s_and_b32 s16, s29, 0xff -; GFX9-NEXT: s_lshl_b32 s17, s17, 8 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 48 -; GFX9-NEXT: v_readlane_b32 s18, v21, 47 -; GFX9-NEXT: s_and_b32 s17, s17, 0xff -; GFX9-NEXT: s_lshl_b32 s18, s18, 8 -; GFX9-NEXT: s_or_b32 s17, s17, s18 -; GFX9-NEXT: s_and_b32 s16, s16, 0xffff -; GFX9-NEXT: s_lshl_b32 s17, s17, 16 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 46 -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: v_mov_b32_e32 v1, s16 -; GFX9-NEXT: s_and_b32 s16, s44, 0xff -; GFX9-NEXT: s_lshl_b32 s17, s17, 8 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 45 -; GFX9-NEXT: s_and_b32 s17, s17, 0xff -; GFX9-NEXT: s_lshl_b32 s18, s78, 8 -; GFX9-NEXT: s_or_b32 s17, s17, s18 -; GFX9-NEXT: s_and_b32 s16, s16, 0xffff -; GFX9-NEXT: s_lshl_b32 s17, s17, 16 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 44 -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 -; GFX9-NEXT: v_mov_b32_e32 v1, s16 -; GFX9-NEXT: s_and_b32 s16, s45, 0xff -; GFX9-NEXT: s_lshl_b32 s17, s17, 8 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 43 -; GFX9-NEXT: v_readlane_b32 s18, v21, 42 -; GFX9-NEXT: s_and_b32 s17, s17, 0xff -; GFX9-NEXT: s_lshl_b32 s18, s18, 8 -; GFX9-NEXT: s_or_b32 s17, s17, s18 -; GFX9-NEXT: s_and_b32 s16, s16, 0xffff -; GFX9-NEXT: s_lshl_b32 s17, s17, 16 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 41 -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: v_mov_b32_e32 v1, s16 -; GFX9-NEXT: s_and_b32 s16, s42, 0xff -; GFX9-NEXT: s_lshl_b32 s17, s17, 8 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 40 -; GFX9-NEXT: s_and_b32 s17, s17, 0xff -; GFX9-NEXT: s_lshl_b32 s18, s76, 8 -; GFX9-NEXT: s_or_b32 s17, s17, s18 -; GFX9-NEXT: s_and_b32 s16, s16, 0xffff -; GFX9-NEXT: s_lshl_b32 s17, s17, 16 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 39 -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: v_mov_b32_e32 v1, s16 -; GFX9-NEXT: s_and_b32 s16, s43, 0xff -; GFX9-NEXT: s_lshl_b32 s17, s17, 8 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 38 -; GFX9-NEXT: v_readlane_b32 s18, v21, 37 -; GFX9-NEXT: s_and_b32 s17, s17, 0xff -; GFX9-NEXT: s_lshl_b32 s18, s18, 8 -; GFX9-NEXT: s_or_b32 s17, s17, s18 -; GFX9-NEXT: s_and_b32 s16, s16, 0xffff -; GFX9-NEXT: s_lshl_b32 s17, s17, 16 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 36 -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64 -; GFX9-NEXT: v_mov_b32_e32 v1, s16 -; GFX9-NEXT: s_and_b32 s16, s40, 0xff -; GFX9-NEXT: s_lshl_b32 s17, s17, 8 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 35 -; GFX9-NEXT: s_and_b32 s17, s17, 0xff -; GFX9-NEXT: s_lshl_b32 s18, s74, 8 -; GFX9-NEXT: s_or_b32 s17, s17, s18 -; GFX9-NEXT: s_and_b32 s16, s16, 0xffff -; GFX9-NEXT: s_lshl_b32 s17, s17, 16 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 34 +; GFX9-NEXT: s_lshl_b32 s18, s18, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s18 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68 ; GFX9-NEXT: v_mov_b32_e32 v1, s16 -; GFX9-NEXT: s_and_b32 s16, s41, 0xff +; GFX9-NEXT: s_and_b32 s16, s17, 0xff +; GFX9-NEXT: v_readlane_b32 s17, v22, 34 ; GFX9-NEXT: s_lshl_b32 s17, s17, 8 ; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 33 -; GFX9-NEXT: v_readlane_b32 s18, v21, 32 +; GFX9-NEXT: v_readlane_b32 s17, v22, 33 +; GFX9-NEXT: v_readlane_b32 s18, v22, 32 ; GFX9-NEXT: s_and_b32 s17, s17, 0xff ; GFX9-NEXT: s_lshl_b32 s18, s18, 8 ; GFX9-NEXT: s_or_b32 s17, s17, s18 @@ -10303,11 +10387,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s16, s16, s17 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 ; GFX9-NEXT: v_mov_b32_e32 v1, s16 -; GFX9-NEXT: v_readlane_b32 s16, v21, 31 +; GFX9-NEXT: v_readlane_b32 s16, v22, 31 ; GFX9-NEXT: s_and_b32 s14, s14, 0xff ; GFX9-NEXT: s_lshl_b32 s16, s16, 8 ; GFX9-NEXT: s_or_b32 s14, s14, s16 -; GFX9-NEXT: v_readlane_b32 s16, v21, 30 +; GFX9-NEXT: v_readlane_b32 s16, v22, 30 ; GFX9-NEXT: s_and_b32 s16, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s17, s72, 8 ; GFX9-NEXT: s_or_b32 s16, s16, s17 @@ -10317,11 +10401,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76 ; GFX9-NEXT: v_mov_b32_e32 v1, s14 ; GFX9-NEXT: s_and_b32 s14, s15, 0xff -; GFX9-NEXT: v_readlane_b32 s15, v21, 29 +; GFX9-NEXT: v_readlane_b32 s15, v22, 29 ; GFX9-NEXT: s_lshl_b32 s15, s15, 8 ; GFX9-NEXT: s_or_b32 s14, s14, s15 -; GFX9-NEXT: v_readlane_b32 s15, v21, 28 -; GFX9-NEXT: v_readlane_b32 s16, v21, 27 +; GFX9-NEXT: v_readlane_b32 s15, v22, 28 +; GFX9-NEXT: v_readlane_b32 s16, v22, 27 ; GFX9-NEXT: s_and_b32 s15, s15, 0xff ; GFX9-NEXT: s_lshl_b32 s16, s16, 8 ; GFX9-NEXT: s_or_b32 s15, s15, s16 @@ -10330,11 +10414,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s14, s14, s15 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 ; GFX9-NEXT: v_mov_b32_e32 v1, s14 -; GFX9-NEXT: v_readlane_b32 s14, v21, 26 +; GFX9-NEXT: v_readlane_b32 s14, v22, 26 ; GFX9-NEXT: s_and_b32 s12, s12, 0xff ; GFX9-NEXT: s_lshl_b32 s14, s14, 8 ; GFX9-NEXT: s_or_b32 s12, s12, s14 -; GFX9-NEXT: v_readlane_b32 s14, v21, 25 +; GFX9-NEXT: v_readlane_b32 s14, v22, 25 ; GFX9-NEXT: s_and_b32 s14, s14, 0xff ; GFX9-NEXT: s_lshl_b32 s15, s62, 8 ; GFX9-NEXT: s_or_b32 s14, s14, s15 @@ -10344,11 +10428,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84 ; GFX9-NEXT: v_mov_b32_e32 v1, s12 ; GFX9-NEXT: s_and_b32 s12, s13, 0xff -; GFX9-NEXT: v_readlane_b32 s13, v21, 24 +; GFX9-NEXT: v_readlane_b32 s13, v22, 24 ; GFX9-NEXT: s_lshl_b32 s13, s13, 8 ; GFX9-NEXT: s_or_b32 s12, s12, s13 -; GFX9-NEXT: v_readlane_b32 s13, v21, 23 -; GFX9-NEXT: v_readlane_b32 s14, v21, 22 +; GFX9-NEXT: v_readlane_b32 s13, v22, 23 +; GFX9-NEXT: v_readlane_b32 s14, v22, 22 ; GFX9-NEXT: s_and_b32 s13, s13, 0xff ; GFX9-NEXT: s_lshl_b32 s14, s14, 8 ; GFX9-NEXT: s_or_b32 s13, s13, s14 @@ -10357,11 +10441,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s12, s12, s13 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88 ; GFX9-NEXT: v_mov_b32_e32 v1, s12 -; GFX9-NEXT: v_readlane_b32 s12, v21, 21 +; GFX9-NEXT: v_readlane_b32 s12, v22, 21 ; GFX9-NEXT: s_and_b32 s10, s10, 0xff ; GFX9-NEXT: s_lshl_b32 s12, s12, 8 ; GFX9-NEXT: s_or_b32 s10, s10, s12 -; GFX9-NEXT: v_readlane_b32 s12, v21, 20 +; GFX9-NEXT: v_readlane_b32 s12, v22, 20 ; GFX9-NEXT: s_and_b32 s12, s12, 0xff ; GFX9-NEXT: s_lshl_b32 s13, s60, 8 ; GFX9-NEXT: s_or_b32 s12, s12, s13 @@ -10371,11 +10455,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92 ; GFX9-NEXT: v_mov_b32_e32 v1, s10 ; GFX9-NEXT: s_and_b32 s10, s11, 0xff -; GFX9-NEXT: v_readlane_b32 s11, v21, 19 +; GFX9-NEXT: v_readlane_b32 s11, v22, 19 ; GFX9-NEXT: s_lshl_b32 s11, s11, 8 ; GFX9-NEXT: s_or_b32 s10, s10, s11 -; GFX9-NEXT: v_readlane_b32 s11, v21, 18 -; GFX9-NEXT: v_readlane_b32 s12, v21, 17 +; GFX9-NEXT: v_readlane_b32 s11, v22, 18 +; GFX9-NEXT: v_readlane_b32 s12, v22, 17 ; GFX9-NEXT: s_and_b32 s11, s11, 0xff ; GFX9-NEXT: s_lshl_b32 s12, s12, 8 ; GFX9-NEXT: s_or_b32 s11, s11, s12 @@ -10384,11 +10468,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s10, s10, s11 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96 ; GFX9-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-NEXT: v_readlane_b32 s10, v21, 16 +; GFX9-NEXT: v_readlane_b32 s10, v22, 16 ; GFX9-NEXT: s_and_b32 s8, s8, 0xff ; GFX9-NEXT: s_lshl_b32 s10, s10, 8 ; GFX9-NEXT: s_or_b32 s8, s8, s10 -; GFX9-NEXT: v_readlane_b32 s10, v21, 15 +; GFX9-NEXT: v_readlane_b32 s10, v22, 15 ; GFX9-NEXT: s_and_b32 s10, s10, 0xff ; GFX9-NEXT: s_lshl_b32 s11, s58, 8 ; GFX9-NEXT: s_or_b32 s10, s10, s11 @@ -10398,11 +10482,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: s_and_b32 s8, s9, 0xff -; GFX9-NEXT: v_readlane_b32 s9, v21, 14 +; GFX9-NEXT: v_readlane_b32 s9, v22, 14 ; GFX9-NEXT: s_lshl_b32 s9, s9, 8 ; GFX9-NEXT: s_or_b32 s8, s8, s9 -; GFX9-NEXT: v_readlane_b32 s9, v21, 13 -; GFX9-NEXT: v_readlane_b32 s10, v21, 12 +; GFX9-NEXT: v_readlane_b32 s9, v22, 13 +; GFX9-NEXT: v_readlane_b32 s10, v22, 12 ; GFX9-NEXT: s_and_b32 s9, s9, 0xff ; GFX9-NEXT: s_lshl_b32 s10, s10, 8 ; GFX9-NEXT: s_or_b32 s9, s9, s10 @@ -10411,13 +10495,13 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s8, s8, s9 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-NEXT: v_readlane_b32 s8, v21, 11 +; GFX9-NEXT: v_readlane_b32 s8, v22, 11 ; GFX9-NEXT: s_and_b32 s6, s6, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s8, 8 ; GFX9-NEXT: s_or_b32 s6, s6, s8 -; GFX9-NEXT: v_readlane_b32 s8, v21, 10 +; GFX9-NEXT: v_readlane_b32 s8, v22, 10 ; GFX9-NEXT: s_and_b32 s8, s8, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s56, 8 +; GFX9-NEXT: s_lshl_b32 s9, s28, 8 ; GFX9-NEXT: s_or_b32 s8, s8, s9 ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s8, s8, 16 @@ -10425,11 +10509,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: s_and_b32 s6, s7, 0xff -; GFX9-NEXT: v_readlane_b32 s7, v21, 9 +; GFX9-NEXT: v_readlane_b32 s7, v22, 9 ; GFX9-NEXT: s_lshl_b32 s7, s7, 8 ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: v_readlane_b32 s7, v21, 8 -; GFX9-NEXT: v_readlane_b32 s8, v21, 7 +; GFX9-NEXT: v_readlane_b32 s7, v22, 8 +; GFX9-NEXT: v_readlane_b32 s8, v22, 7 ; GFX9-NEXT: s_and_b32 s7, s7, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s8, 8 ; GFX9-NEXT: s_or_b32 s7, s7, s8 @@ -10438,12 +10522,12 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_readlane_b32 s6, v21, 6 +; GFX9-NEXT: v_readlane_b32 s6, v22, 6 ; GFX9-NEXT: s_and_b32 s4, s4, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s6, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s6 -; GFX9-NEXT: v_readlane_b32 s6, v21, 5 -; GFX9-NEXT: v_readlane_b32 s8, v21, 0 +; GFX9-NEXT: v_readlane_b32 s6, v22, 5 +; GFX9-NEXT: v_readlane_b32 s8, v22, 0 ; GFX9-NEXT: s_and_b32 s6, s6, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s8, 8 ; GFX9-NEXT: s_or_b32 s6, s6, s7 @@ -10453,11 +10537,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_and_b32 s4, s5, 0xff -; GFX9-NEXT: v_readlane_b32 s5, v21, 4 +; GFX9-NEXT: v_readlane_b32 s5, v22, 4 ; GFX9-NEXT: s_lshl_b32 s5, s5, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: v_readlane_b32 s5, v21, 3 -; GFX9-NEXT: v_readlane_b32 s6, v21, 2 +; GFX9-NEXT: v_readlane_b32 s5, v22, 3 +; GFX9-NEXT: v_readlane_b32 s6, v22, 2 ; GFX9-NEXT: s_and_b32 s5, s5, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s6, 8 ; GFX9-NEXT: s_or_b32 s5, s5, s6 @@ -10466,61 +10550,61 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_readlane_b32 s9, v21, 1 +; GFX9-NEXT: v_readlane_b32 s9, v22, 1 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 -; GFX9-NEXT: v_readlane_b32 s99, v20, 35 -; GFX9-NEXT: v_readlane_b32 s98, v20, 34 -; GFX9-NEXT: v_readlane_b32 s97, v20, 33 -; GFX9-NEXT: v_readlane_b32 s96, v20, 32 -; GFX9-NEXT: v_readlane_b32 s87, v20, 31 -; GFX9-NEXT: v_readlane_b32 s86, v20, 30 -; GFX9-NEXT: v_readlane_b32 s85, v20, 29 -; GFX9-NEXT: v_readlane_b32 s84, v20, 28 -; GFX9-NEXT: v_readlane_b32 s83, v20, 27 -; GFX9-NEXT: v_readlane_b32 s82, v20, 26 -; GFX9-NEXT: v_readlane_b32 s81, v20, 25 -; GFX9-NEXT: v_readlane_b32 s80, v20, 24 -; GFX9-NEXT: v_readlane_b32 s71, v20, 23 -; GFX9-NEXT: v_readlane_b32 s70, v20, 22 -; GFX9-NEXT: v_readlane_b32 s69, v20, 21 -; GFX9-NEXT: v_readlane_b32 s68, v20, 20 -; GFX9-NEXT: v_readlane_b32 s67, v20, 19 -; GFX9-NEXT: v_readlane_b32 s66, v20, 18 -; GFX9-NEXT: v_readlane_b32 s65, v20, 17 -; GFX9-NEXT: v_readlane_b32 s64, v20, 16 -; GFX9-NEXT: v_readlane_b32 s55, v20, 15 -; GFX9-NEXT: v_readlane_b32 s54, v20, 14 -; GFX9-NEXT: v_readlane_b32 s53, v20, 13 -; GFX9-NEXT: v_readlane_b32 s52, v20, 12 -; GFX9-NEXT: v_readlane_b32 s51, v20, 11 -; GFX9-NEXT: v_readlane_b32 s50, v20, 10 -; GFX9-NEXT: v_readlane_b32 s49, v20, 9 -; GFX9-NEXT: v_readlane_b32 s48, v20, 8 -; GFX9-NEXT: v_readlane_b32 s39, v20, 7 -; GFX9-NEXT: v_readlane_b32 s38, v20, 6 -; GFX9-NEXT: v_readlane_b32 s37, v20, 5 -; GFX9-NEXT: v_readlane_b32 s36, v20, 4 -; GFX9-NEXT: v_readlane_b32 s35, v20, 3 -; GFX9-NEXT: v_readlane_b32 s34, v20, 2 -; GFX9-NEXT: v_readlane_b32 s31, v20, 1 -; GFX9-NEXT: v_readlane_b32 s30, v20, 0 +; GFX9-NEXT: v_readlane_b32 s99, v21, 35 +; GFX9-NEXT: v_readlane_b32 s98, v21, 34 +; GFX9-NEXT: v_readlane_b32 s97, v21, 33 +; GFX9-NEXT: v_readlane_b32 s96, v21, 32 +; GFX9-NEXT: v_readlane_b32 s87, v21, 31 +; GFX9-NEXT: v_readlane_b32 s86, v21, 30 +; GFX9-NEXT: v_readlane_b32 s85, v21, 29 +; GFX9-NEXT: v_readlane_b32 s84, v21, 28 +; GFX9-NEXT: v_readlane_b32 s83, v21, 27 +; GFX9-NEXT: v_readlane_b32 s82, v21, 26 +; GFX9-NEXT: v_readlane_b32 s81, v21, 25 +; GFX9-NEXT: v_readlane_b32 s80, v21, 24 +; GFX9-NEXT: v_readlane_b32 s71, v21, 23 +; GFX9-NEXT: v_readlane_b32 s70, v21, 22 +; GFX9-NEXT: v_readlane_b32 s69, v21, 21 +; GFX9-NEXT: v_readlane_b32 s68, v21, 20 +; GFX9-NEXT: v_readlane_b32 s67, v21, 19 +; GFX9-NEXT: v_readlane_b32 s66, v21, 18 +; GFX9-NEXT: v_readlane_b32 s65, v21, 17 +; GFX9-NEXT: v_readlane_b32 s64, v21, 16 +; GFX9-NEXT: v_readlane_b32 s55, v21, 15 +; GFX9-NEXT: v_readlane_b32 s54, v21, 14 +; GFX9-NEXT: v_readlane_b32 s53, v21, 13 +; GFX9-NEXT: v_readlane_b32 s52, v21, 12 +; GFX9-NEXT: v_readlane_b32 s51, v21, 11 +; GFX9-NEXT: v_readlane_b32 s50, v21, 10 +; GFX9-NEXT: v_readlane_b32 s49, v21, 9 +; GFX9-NEXT: v_readlane_b32 s48, v21, 8 +; GFX9-NEXT: v_readlane_b32 s39, v21, 7 +; GFX9-NEXT: v_readlane_b32 s38, v21, 6 +; GFX9-NEXT: v_readlane_b32 s37, v21, 5 +; GFX9-NEXT: v_readlane_b32 s36, v21, 4 +; GFX9-NEXT: v_readlane_b32 s35, v21, 3 +; GFX9-NEXT: v_readlane_b32 s34, v21, 2 +; GFX9-NEXT: v_readlane_b32 s31, v21, 1 +; GFX9-NEXT: v_readlane_b32 s30, v21, 0 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB13_4: -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 ; GFX9-NEXT: ; implicit-def: $sgpr83 ; GFX9-NEXT: ; implicit-def: $sgpr82 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: v_writelane_b32 v21, s82, 0 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: v_writelane_b32 v22, s82, 0 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; implicit-def: $sgpr80 ; GFX9-NEXT: ; implicit-def: $sgpr71 ; GFX9-NEXT: ; implicit-def: $sgpr70 @@ -10563,101 +10647,101 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: ; implicit-def: $sgpr62 ; GFX9-NEXT: ; implicit-def: $sgpr60 ; GFX9-NEXT: ; implicit-def: $sgpr58 -; GFX9-NEXT: ; implicit-def: $sgpr56 -; GFX9-NEXT: v_writelane_b32 v21, s83, 1 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr28 +; GFX9-NEXT: v_writelane_b32 v22, s83, 1 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 ; GFX9-NEXT: ; implicit-def: $sgpr82 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 ; GFX9-NEXT: s_branch .LBB13_2 ; ; GFX11-LABEL: bitcast_v32i32_to_v128i8_scalar: @@ -10665,213 +10749,240 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill -; GFX11-NEXT: scratch_store_b32 off, v16, s32 -; GFX11-NEXT: scratch_store_b32 off, v17, s32 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v18, s32 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v19, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v34, s32 +; GFX11-NEXT: scratch_store_b32 off, v35, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v36, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v37, s32 offset:12 ; GFX11-NEXT: s_mov_b32 exec_lo, s4 -; GFX11-NEXT: v_writelane_b32 v16, s30, 0 -; GFX11-NEXT: v_writelane_b32 v17, s96, 0 +; GFX11-NEXT: v_writelane_b32 v34, s30, 0 +; GFX11-NEXT: v_writelane_b32 v35, s96, 0 +; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1 +; GFX11-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3 +; GFX11-NEXT: v_writelane_b32 v34, s31, 1 +; GFX11-NEXT: v_writelane_b32 v35, s97, 1 +; GFX11-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v21, s17 +; GFX11-NEXT: v_dual_mov_b32 v22, s18 :: v_dual_mov_b32 v23, s19 +; GFX11-NEXT: v_writelane_b32 v34, s34, 2 +; GFX11-NEXT: v_writelane_b32 v35, s98, 2 +; GFX11-NEXT: v_dual_mov_b32 v24, s20 :: v_dual_mov_b32 v25, s21 +; GFX11-NEXT: v_dual_mov_b32 v26, s22 :: v_dual_mov_b32 v27, s23 +; GFX11-NEXT: v_writelane_b32 v34, s35, 3 +; GFX11-NEXT: v_writelane_b32 v35, s99, 3 +; GFX11-NEXT: v_dual_mov_b32 v28, s24 :: v_dual_mov_b32 v29, s25 +; GFX11-NEXT: v_dual_mov_b32 v30, s26 :: v_dual_mov_b32 v31, s27 +; GFX11-NEXT: v_writelane_b32 v34, s36, 4 +; GFX11-NEXT: v_writelane_b32 v35, s100, 4 +; GFX11-NEXT: v_dual_mov_b32 v32, s28 :: v_dual_mov_b32 v33, s29 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 -; GFX11-NEXT: v_readfirstlane_b32 s40, v1 -; GFX11-NEXT: v_readfirstlane_b32 s41, v2 -; GFX11-NEXT: v_writelane_b32 v16, s31, 1 -; GFX11-NEXT: v_writelane_b32 v17, s97, 1 -; GFX11-NEXT: v_readfirstlane_b32 s14, v3 -; GFX11-NEXT: v_readfirstlane_b32 s15, v4 -; GFX11-NEXT: v_readfirstlane_b32 s12, v5 -; GFX11-NEXT: v_writelane_b32 v16, s34, 2 -; GFX11-NEXT: v_writelane_b32 v17, s98, 2 -; GFX11-NEXT: v_readfirstlane_b32 s13, v6 -; GFX11-NEXT: v_readfirstlane_b32 s10, v7 -; GFX11-NEXT: v_readfirstlane_b32 s11, v8 -; GFX11-NEXT: v_writelane_b32 v16, s35, 3 -; GFX11-NEXT: v_writelane_b32 v17, s99, 3 -; GFX11-NEXT: v_readfirstlane_b32 s8, v9 -; GFX11-NEXT: v_readfirstlane_b32 s9, v10 -; GFX11-NEXT: v_readfirstlane_b32 s6, v11 -; GFX11-NEXT: v_writelane_b32 v16, s36, 4 -; GFX11-NEXT: v_writelane_b32 v17, s100, 4 -; GFX11-NEXT: v_readfirstlane_b32 s7, v12 -; GFX11-NEXT: v_readfirstlane_b32 s4, v13 -; GFX11-NEXT: v_readfirstlane_b32 s5, v14 -; GFX11-NEXT: v_writelane_b32 v16, s37, 5 -; GFX11-NEXT: v_writelane_b32 v17, s101, 5 +; GFX11-NEXT: v_writelane_b32 v34, s37, 5 +; GFX11-NEXT: v_writelane_b32 v35, s101, 5 +; GFX11-NEXT: v_readfirstlane_b32 s40, v16 +; GFX11-NEXT: v_readfirstlane_b32 s41, v17 +; GFX11-NEXT: v_readfirstlane_b32 s28, v18 +; GFX11-NEXT: v_writelane_b32 v34, s38, 6 +; GFX11-NEXT: v_writelane_b32 v35, s102, 6 +; GFX11-NEXT: v_readfirstlane_b32 s29, v19 +; GFX11-NEXT: v_readfirstlane_b32 s26, v20 +; GFX11-NEXT: v_readfirstlane_b32 s27, v21 +; GFX11-NEXT: v_writelane_b32 v34, s39, 7 +; GFX11-NEXT: v_writelane_b32 v35, s103, 7 +; GFX11-NEXT: v_readfirstlane_b32 s24, v22 +; GFX11-NEXT: v_readfirstlane_b32 s25, v23 +; GFX11-NEXT: v_readfirstlane_b32 s22, v24 +; GFX11-NEXT: v_writelane_b32 v34, s48, 8 +; GFX11-NEXT: v_readfirstlane_b32 s23, v25 +; GFX11-NEXT: v_readfirstlane_b32 s20, v26 +; GFX11-NEXT: v_readfirstlane_b32 s21, v27 +; GFX11-NEXT: v_readfirstlane_b32 s18, v28 +; GFX11-NEXT: v_writelane_b32 v34, s49, 9 +; GFX11-NEXT: v_readfirstlane_b32 s19, v29 +; GFX11-NEXT: v_readfirstlane_b32 s16, v30 +; GFX11-NEXT: v_readfirstlane_b32 s17, v31 +; GFX11-NEXT: v_readfirstlane_b32 s14, v32 +; GFX11-NEXT: v_writelane_b32 v34, s50, 10 +; GFX11-NEXT: v_readfirstlane_b32 s15, v33 +; GFX11-NEXT: v_readfirstlane_b32 s12, v1 +; GFX11-NEXT: v_readfirstlane_b32 s13, v2 +; GFX11-NEXT: v_readfirstlane_b32 s10, v3 +; GFX11-NEXT: v_writelane_b32 v34, s51, 11 +; GFX11-NEXT: v_readfirstlane_b32 s11, v4 +; GFX11-NEXT: v_readfirstlane_b32 s8, v5 +; GFX11-NEXT: v_readfirstlane_b32 s9, v6 +; GFX11-NEXT: v_readfirstlane_b32 s6, v7 +; GFX11-NEXT: v_writelane_b32 v34, s52, 12 +; GFX11-NEXT: v_readfirstlane_b32 s7, v8 +; GFX11-NEXT: v_readfirstlane_b32 s4, v9 +; GFX11-NEXT: v_readfirstlane_b32 s5, v10 +; GFX11-NEXT: v_readfirstlane_b32 s2, v11 +; GFX11-NEXT: v_writelane_b32 v34, s53, 13 +; GFX11-NEXT: v_readfirstlane_b32 s3, v12 +; GFX11-NEXT: v_readfirstlane_b32 s0, v13 +; GFX11-NEXT: v_readfirstlane_b32 s1, v14 ; GFX11-NEXT: s_mov_b32 s101, 0 +; GFX11-NEXT: v_writelane_b32 v34, s54, 14 ; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo -; GFX11-NEXT: ; implicit-def: $vgpr19 : SGPR spill to VGPR lane -; GFX11-NEXT: ; implicit-def: $vgpr18 : SGPR spill to VGPR lane -; GFX11-NEXT: v_writelane_b32 v16, s38, 6 -; GFX11-NEXT: v_writelane_b32 v17, s102, 6 -; GFX11-NEXT: v_writelane_b32 v16, s39, 7 -; GFX11-NEXT: v_writelane_b32 v17, s103, 7 -; GFX11-NEXT: v_writelane_b32 v16, s48, 8 -; GFX11-NEXT: v_writelane_b32 v17, s104, 8 -; GFX11-NEXT: v_writelane_b32 v16, s49, 9 -; GFX11-NEXT: v_writelane_b32 v16, s50, 10 -; GFX11-NEXT: v_writelane_b32 v16, s51, 11 -; GFX11-NEXT: v_writelane_b32 v16, s52, 12 -; GFX11-NEXT: v_writelane_b32 v16, s53, 13 -; GFX11-NEXT: v_writelane_b32 v16, s54, 14 -; GFX11-NEXT: v_writelane_b32 v16, s55, 15 -; GFX11-NEXT: v_writelane_b32 v16, s64, 16 -; GFX11-NEXT: v_writelane_b32 v16, s65, 17 -; GFX11-NEXT: v_writelane_b32 v16, s66, 18 -; GFX11-NEXT: v_writelane_b32 v16, s67, 19 -; GFX11-NEXT: v_writelane_b32 v16, s68, 20 -; GFX11-NEXT: v_writelane_b32 v16, s69, 21 -; GFX11-NEXT: v_writelane_b32 v16, s70, 22 -; GFX11-NEXT: v_writelane_b32 v16, s71, 23 -; GFX11-NEXT: v_writelane_b32 v16, s80, 24 -; GFX11-NEXT: v_writelane_b32 v16, s81, 25 -; GFX11-NEXT: v_writelane_b32 v16, s82, 26 -; GFX11-NEXT: v_writelane_b32 v16, s83, 27 -; GFX11-NEXT: v_writelane_b32 v16, s84, 28 -; GFX11-NEXT: v_writelane_b32 v16, s85, 29 -; GFX11-NEXT: v_writelane_b32 v16, s86, 30 -; GFX11-NEXT: v_writelane_b32 v16, s87, 31 +; GFX11-NEXT: v_writelane_b32 v35, s104, 8 +; GFX11-NEXT: ; implicit-def: $vgpr37 : SGPR spill to VGPR lane +; GFX11-NEXT: ; implicit-def: $vgpr36 : SGPR spill to VGPR lane +; GFX11-NEXT: v_writelane_b32 v34, s55, 15 +; GFX11-NEXT: v_writelane_b32 v34, s64, 16 +; GFX11-NEXT: v_writelane_b32 v34, s65, 17 +; GFX11-NEXT: v_writelane_b32 v34, s66, 18 +; GFX11-NEXT: v_writelane_b32 v34, s67, 19 +; GFX11-NEXT: v_writelane_b32 v34, s68, 20 +; GFX11-NEXT: v_writelane_b32 v34, s69, 21 +; GFX11-NEXT: v_writelane_b32 v34, s70, 22 +; GFX11-NEXT: v_writelane_b32 v34, s71, 23 +; GFX11-NEXT: v_writelane_b32 v34, s80, 24 +; GFX11-NEXT: v_writelane_b32 v34, s81, 25 +; GFX11-NEXT: v_writelane_b32 v34, s82, 26 +; GFX11-NEXT: v_writelane_b32 v34, s83, 27 +; GFX11-NEXT: v_writelane_b32 v34, s84, 28 +; GFX11-NEXT: v_writelane_b32 v34, s85, 29 +; GFX11-NEXT: v_writelane_b32 v34, s86, 30 +; GFX11-NEXT: v_writelane_b32 v34, s87, 31 ; GFX11-NEXT: s_cbranch_scc0 .LBB13_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: s_lshr_b32 s43, s25, 8 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[4:5], 24 -; GFX11-NEXT: v_writelane_b32 v19, s43, 16 -; GFX11-NEXT: s_lshr_b32 s43, s24, 16 -; GFX11-NEXT: s_lshr_b32 s104, s5, 24 -; GFX11-NEXT: s_lshr_b32 s102, s5, 16 -; GFX11-NEXT: s_lshr_b32 s103, s5, 8 -; GFX11-NEXT: v_writelane_b32 v19, s43, 17 -; GFX11-NEXT: s_lshr_b32 s43, s24, 8 -; GFX11-NEXT: s_lshr_b32 s57, s4, 16 -; GFX11-NEXT: s_lshr_b32 s47, s4, 8 -; GFX11-NEXT: s_lshr_b32 s46, s7, 24 -; GFX11-NEXT: v_writelane_b32 v19, s43, 18 -; GFX11-NEXT: s_lshr_b32 s43, s23, 24 -; GFX11-NEXT: s_lshr_b32 vcc_hi, s7, 16 -; GFX11-NEXT: s_lshr_b32 s34, s7, 8 -; GFX11-NEXT: s_lshr_b32 s69, s6, 16 -; GFX11-NEXT: v_writelane_b32 v19, s43, 19 -; GFX11-NEXT: s_lshr_b32 s43, s23, 16 -; GFX11-NEXT: s_lshr_b32 s56, s6, 8 -; GFX11-NEXT: s_lshr_b32 s35, s9, 24 -; GFX11-NEXT: s_lshr_b32 s36, s9, 16 -; GFX11-NEXT: v_writelane_b32 v19, s43, 20 -; GFX11-NEXT: s_lshr_b32 s43, s23, 8 -; GFX11-NEXT: s_lshr_b32 s37, s9, 8 -; GFX11-NEXT: s_lshr_b32 s38, s8, 16 -; GFX11-NEXT: s_lshr_b32 s39, s8, 8 -; GFX11-NEXT: v_writelane_b32 v19, s43, 21 -; GFX11-NEXT: s_lshr_b32 s43, s22, 16 -; GFX11-NEXT: s_lshr_b32 s48, s11, 24 -; GFX11-NEXT: s_lshr_b32 s49, s11, 16 -; GFX11-NEXT: s_lshr_b32 s50, s11, 8 -; GFX11-NEXT: v_writelane_b32 v19, s43, 22 -; GFX11-NEXT: s_lshr_b32 s43, s22, 8 -; GFX11-NEXT: s_lshr_b32 s51, s10, 16 -; GFX11-NEXT: s_lshr_b32 s52, s10, 8 -; GFX11-NEXT: s_lshr_b32 s53, s13, 24 -; GFX11-NEXT: v_writelane_b32 v19, s43, 23 +; GFX11-NEXT: s_lshr_b32 s43, s19, 8 +; GFX11-NEXT: s_lshr_b64 s[62:63], s[0:1], 24 +; GFX11-NEXT: v_writelane_b32 v37, s43, 16 +; GFX11-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-NEXT: s_lshr_b32 s104, s1, 24 +; GFX11-NEXT: s_lshr_b32 s102, s1, 16 +; GFX11-NEXT: s_lshr_b32 s103, s1, 8 +; GFX11-NEXT: v_writelane_b32 v37, s43, 17 +; GFX11-NEXT: s_lshr_b32 s43, s18, 8 +; GFX11-NEXT: s_lshr_b32 s57, s0, 16 +; GFX11-NEXT: s_lshr_b32 s47, s0, 8 +; GFX11-NEXT: s_lshr_b32 s46, s3, 24 +; GFX11-NEXT: v_writelane_b32 v37, s43, 18 ; GFX11-NEXT: s_lshr_b32 s43, s21, 24 -; GFX11-NEXT: s_lshr_b32 s54, s13, 16 -; GFX11-NEXT: s_lshr_b32 s55, s13, 8 -; GFX11-NEXT: s_lshr_b32 s64, s12, 16 -; GFX11-NEXT: v_writelane_b32 v19, s43, 24 +; GFX11-NEXT: s_lshr_b32 vcc_hi, s3, 16 +; GFX11-NEXT: s_lshr_b32 s34, s3, 8 +; GFX11-NEXT: s_lshr_b32 s69, s2, 16 +; GFX11-NEXT: v_writelane_b32 v37, s43, 19 ; GFX11-NEXT: s_lshr_b32 s43, s21, 16 -; GFX11-NEXT: s_lshr_b32 s65, s12, 8 -; GFX11-NEXT: s_lshr_b32 s66, s15, 24 -; GFX11-NEXT: s_lshr_b32 s67, s15, 16 -; GFX11-NEXT: v_writelane_b32 v19, s43, 25 +; GFX11-NEXT: s_lshr_b32 s56, s2, 8 +; GFX11-NEXT: s_lshr_b32 s35, s5, 24 +; GFX11-NEXT: s_lshr_b32 s36, s5, 16 +; GFX11-NEXT: v_writelane_b32 v37, s43, 20 ; GFX11-NEXT: s_lshr_b32 s43, s21, 8 -; GFX11-NEXT: s_lshr_b32 s68, s15, 8 -; GFX11-NEXT: s_lshr_b32 s59, s14, 16 -; GFX11-NEXT: s_lshr_b32 s58, s14, 8 -; GFX11-NEXT: v_writelane_b32 v19, s43, 26 +; GFX11-NEXT: s_lshr_b32 s37, s5, 8 +; GFX11-NEXT: s_lshr_b32 s38, s4, 16 +; GFX11-NEXT: s_lshr_b32 s39, s4, 8 +; GFX11-NEXT: v_writelane_b32 v37, s43, 21 ; GFX11-NEXT: s_lshr_b32 s43, s20, 16 -; GFX11-NEXT: s_lshr_b32 s70, s41, 24 -; GFX11-NEXT: s_lshr_b32 s71, s41, 16 -; GFX11-NEXT: s_lshr_b32 s60, s41, 8 -; GFX11-NEXT: v_writelane_b32 v19, s43, 27 +; GFX11-NEXT: s_lshr_b32 s48, s7, 24 +; GFX11-NEXT: s_lshr_b32 s49, s7, 16 +; GFX11-NEXT: s_lshr_b32 s50, s7, 8 +; GFX11-NEXT: v_writelane_b32 v37, s43, 22 ; GFX11-NEXT: s_lshr_b32 s43, s20, 8 -; GFX11-NEXT: s_lshr_b32 s80, s40, 16 -; GFX11-NEXT: s_lshr_b32 s61, s40, 8 -; GFX11-NEXT: s_lshr_b32 s81, s29, 24 -; GFX11-NEXT: v_writelane_b32 v19, s43, 28 -; GFX11-NEXT: s_lshr_b32 s43, s19, 24 -; GFX11-NEXT: s_lshr_b32 s82, s29, 16 -; GFX11-NEXT: s_lshr_b32 s83, s29, 8 -; GFX11-NEXT: s_lshr_b32 s84, s28, 16 -; GFX11-NEXT: v_writelane_b32 v19, s43, 29 -; GFX11-NEXT: s_lshr_b32 s43, s19, 16 -; GFX11-NEXT: s_lshr_b32 s85, s28, 8 -; GFX11-NEXT: s_lshr_b32 s86, s27, 24 -; GFX11-NEXT: s_lshr_b32 s72, s27, 16 -; GFX11-NEXT: v_writelane_b32 v19, s43, 30 -; GFX11-NEXT: s_lshr_b32 s43, s19, 8 -; GFX11-NEXT: s_lshr_b32 s87, s27, 8 -; GFX11-NEXT: s_lshr_b32 s73, s26, 16 -; GFX11-NEXT: s_lshr_b32 s96, s26, 8 -; GFX11-NEXT: v_writelane_b32 v19, s43, 31 -; GFX11-NEXT: s_lshr_b32 s43, s18, 16 -; GFX11-NEXT: s_lshr_b32 s97, s25, 24 -; GFX11-NEXT: v_writelane_b32 v18, s43, 0 -; GFX11-NEXT: s_lshr_b32 s43, s18, 8 -; GFX11-NEXT: v_writelane_b32 v19, s62, 14 -; GFX11-NEXT: s_lshr_b32 s42, s25, 16 -; GFX11-NEXT: s_lshr_b32 s74, s2, 16 -; GFX11-NEXT: v_writelane_b32 v18, s43, 1 -; GFX11-NEXT: s_lshr_b32 s43, s17, 24 -; GFX11-NEXT: v_writelane_b32 v19, s63, 15 +; GFX11-NEXT: s_lshr_b32 s51, s6, 16 +; GFX11-NEXT: s_lshr_b32 s52, s6, 8 +; GFX11-NEXT: s_lshr_b32 s53, s9, 24 +; GFX11-NEXT: v_writelane_b32 v37, s43, 23 +; GFX11-NEXT: s_lshr_b32 s43, s23, 24 +; GFX11-NEXT: s_lshr_b32 s54, s9, 16 +; GFX11-NEXT: s_lshr_b32 s55, s9, 8 +; GFX11-NEXT: s_lshr_b32 s64, s8, 16 +; GFX11-NEXT: v_writelane_b32 v37, s43, 24 +; GFX11-NEXT: s_lshr_b32 s43, s23, 16 +; GFX11-NEXT: s_lshr_b32 s65, s8, 8 +; GFX11-NEXT: s_lshr_b32 s66, s11, 24 +; GFX11-NEXT: s_lshr_b32 s67, s11, 16 +; GFX11-NEXT: v_writelane_b32 v37, s43, 25 +; GFX11-NEXT: s_lshr_b32 s43, s23, 8 +; GFX11-NEXT: s_lshr_b32 s68, s11, 8 +; GFX11-NEXT: s_lshr_b32 s59, s10, 16 +; GFX11-NEXT: s_lshr_b32 s58, s10, 8 +; GFX11-NEXT: v_writelane_b32 v37, s43, 26 +; GFX11-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-NEXT: s_lshr_b32 s70, s13, 24 +; GFX11-NEXT: s_lshr_b32 s71, s13, 16 +; GFX11-NEXT: s_lshr_b32 s60, s13, 8 +; GFX11-NEXT: v_writelane_b32 v37, s43, 27 +; GFX11-NEXT: s_lshr_b32 s43, s22, 8 +; GFX11-NEXT: s_lshr_b32 s80, s12, 16 +; GFX11-NEXT: s_lshr_b32 s61, s12, 8 +; GFX11-NEXT: s_lshr_b32 s81, s15, 24 +; GFX11-NEXT: v_writelane_b32 v37, s43, 28 +; GFX11-NEXT: s_lshr_b32 s43, s25, 24 +; GFX11-NEXT: s_lshr_b32 s82, s15, 16 +; GFX11-NEXT: s_lshr_b32 s83, s15, 8 +; GFX11-NEXT: s_lshr_b32 s84, s14, 16 +; GFX11-NEXT: v_writelane_b32 v37, s43, 29 +; GFX11-NEXT: s_lshr_b32 s43, s25, 16 +; GFX11-NEXT: s_lshr_b32 s85, s14, 8 +; GFX11-NEXT: s_lshr_b32 s86, s17, 24 +; GFX11-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-NEXT: v_writelane_b32 v37, s43, 30 +; GFX11-NEXT: s_lshr_b32 s43, s25, 8 +; GFX11-NEXT: s_lshr_b32 s87, s17, 8 +; GFX11-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-NEXT: s_lshr_b32 s96, s16, 8 +; GFX11-NEXT: v_writelane_b32 v37, s43, 31 +; GFX11-NEXT: s_lshr_b32 s43, s24, 16 +; GFX11-NEXT: s_lshr_b32 s97, s19, 24 +; GFX11-NEXT: v_writelane_b32 v36, s43, 0 +; GFX11-NEXT: s_lshr_b32 s43, s24, 8 +; GFX11-NEXT: v_writelane_b32 v37, s62, 14 +; GFX11-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-NEXT: s_lshr_b32 s74, s28, 16 +; GFX11-NEXT: v_writelane_b32 v36, s43, 1 +; GFX11-NEXT: s_lshr_b32 s43, s27, 24 +; GFX11-NEXT: v_writelane_b32 v37, s63, 15 +; GFX11-NEXT: s_lshr_b64 s[62:63], s[2:3], 24 +; GFX11-NEXT: s_lshr_b32 s98, s41, 24 +; GFX11-NEXT: v_writelane_b32 v36, s43, 2 +; GFX11-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-NEXT: v_writelane_b32 v37, s62, 12 +; GFX11-NEXT: s_lshr_b32 s99, s41, 16 +; GFX11-NEXT: s_lshr_b32 s100, s41, 8 +; GFX11-NEXT: v_writelane_b32 v36, s43, 3 +; GFX11-NEXT: s_lshr_b32 s43, s27, 8 +; GFX11-NEXT: v_writelane_b32 v37, s63, 13 +; GFX11-NEXT: s_lshr_b64 s[62:63], s[4:5], 24 +; GFX11-NEXT: s_lshr_b32 s44, s40, 16 +; GFX11-NEXT: v_writelane_b32 v36, s43, 4 +; GFX11-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-NEXT: v_writelane_b32 v37, s62, 10 +; GFX11-NEXT: s_lshr_b32 s45, s40, 8 +; GFX11-NEXT: s_lshr_b64 s[76:77], s[16:17], 24 +; GFX11-NEXT: v_writelane_b32 v36, s43, 5 +; GFX11-NEXT: s_lshr_b32 s43, s26, 8 +; GFX11-NEXT: v_writelane_b32 v37, s63, 11 ; GFX11-NEXT: s_lshr_b64 s[62:63], s[6:7], 24 -; GFX11-NEXT: s_lshr_b32 s98, s1, 24 -; GFX11-NEXT: v_writelane_b32 v18, s43, 2 -; GFX11-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-NEXT: v_writelane_b32 v19, s62, 12 -; GFX11-NEXT: s_lshr_b32 s99, s1, 16 -; GFX11-NEXT: s_lshr_b32 s100, s1, 8 -; GFX11-NEXT: v_writelane_b32 v18, s43, 3 -; GFX11-NEXT: s_lshr_b32 s43, s17, 8 -; GFX11-NEXT: v_writelane_b32 v19, s63, 13 +; GFX11-NEXT: s_lshr_b64 s[88:89], s[18:19], 24 +; GFX11-NEXT: v_writelane_b32 v36, s43, 6 +; GFX11-NEXT: s_lshr_b32 s43, s29, 24 +; GFX11-NEXT: v_writelane_b32 v37, s62, 8 +; GFX11-NEXT: s_lshr_b64 s[78:79], s[22:23], 24 +; GFX11-NEXT: s_lshr_b64 s[90:91], s[24:25], 24 +; GFX11-NEXT: v_writelane_b32 v36, s43, 7 +; GFX11-NEXT: s_lshr_b32 s43, s29, 16 +; GFX11-NEXT: v_writelane_b32 v37, s63, 9 ; GFX11-NEXT: s_lshr_b64 s[62:63], s[8:9], 24 -; GFX11-NEXT: s_lshr_b32 s44, s0, 16 -; GFX11-NEXT: v_writelane_b32 v18, s43, 4 -; GFX11-NEXT: s_lshr_b32 s43, s16, 16 -; GFX11-NEXT: v_writelane_b32 v19, s62, 10 -; GFX11-NEXT: s_lshr_b32 s45, s0, 8 -; GFX11-NEXT: s_lshr_b64 s[76:77], s[26:27], 24 -; GFX11-NEXT: v_writelane_b32 v18, s43, 5 -; GFX11-NEXT: s_lshr_b32 s43, s16, 8 -; GFX11-NEXT: v_writelane_b32 v19, s63, 11 +; GFX11-NEXT: s_lshr_b64 s[92:93], s[26:27], 24 +; GFX11-NEXT: v_writelane_b32 v36, s43, 8 +; GFX11-NEXT: s_lshr_b32 s43, s29, 8 +; GFX11-NEXT: v_writelane_b32 v37, s62, 6 +; GFX11-NEXT: s_lshr_b64 s[94:95], s[28:29], 24 +; GFX11-NEXT: s_lshr_b64 s[30:31], s[40:41], 24 +; GFX11-NEXT: v_writelane_b32 v36, s43, 9 +; GFX11-NEXT: s_lshr_b32 s43, s28, 8 +; GFX11-NEXT: v_writelane_b32 v37, s63, 7 ; GFX11-NEXT: s_lshr_b64 s[62:63], s[10:11], 24 -; GFX11-NEXT: s_lshr_b64 s[88:89], s[24:25], 24 -; GFX11-NEXT: v_writelane_b32 v18, s43, 6 -; GFX11-NEXT: s_lshr_b32 s43, s3, 24 -; GFX11-NEXT: v_writelane_b32 v19, s62, 8 -; GFX11-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 -; GFX11-NEXT: s_lshr_b64 s[90:91], s[18:19], 24 -; GFX11-NEXT: v_writelane_b32 v18, s43, 7 -; GFX11-NEXT: s_lshr_b32 s43, s3, 16 -; GFX11-NEXT: v_writelane_b32 v19, s63, 9 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v37, s62, 4 +; GFX11-NEXT: v_writelane_b32 v37, s63, 5 ; GFX11-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 -; GFX11-NEXT: s_lshr_b64 s[92:93], s[16:17], 24 -; GFX11-NEXT: v_writelane_b32 v18, s43, 8 -; GFX11-NEXT: s_lshr_b32 s43, s3, 8 -; GFX11-NEXT: v_writelane_b32 v19, s62, 6 -; GFX11-NEXT: s_lshr_b64 s[94:95], s[2:3], 24 -; GFX11-NEXT: s_lshr_b64 s[30:31], s[0:1], 24 -; GFX11-NEXT: v_writelane_b32 v18, s43, 9 -; GFX11-NEXT: s_lshr_b32 s43, s2, 8 -; GFX11-NEXT: v_writelane_b32 v19, s63, 7 +; GFX11-NEXT: v_writelane_b32 v37, s62, 2 +; GFX11-NEXT: v_writelane_b32 v37, s63, 3 ; GFX11-NEXT: s_lshr_b64 s[62:63], s[14:15], 24 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v19, s62, 4 -; GFX11-NEXT: v_writelane_b32 v19, s63, 5 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[40:41], 24 -; GFX11-NEXT: v_writelane_b32 v19, s62, 2 -; GFX11-NEXT: v_writelane_b32 v19, s63, 3 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[28:29], 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v19, s62, 0 -; GFX11-NEXT: v_writelane_b32 v19, s63, 1 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 +; GFX11-NEXT: v_writelane_b32 v37, s62, 0 +; GFX11-NEXT: v_writelane_b32 v37, s63, 1 +; GFX11-NEXT: s_lshr_b64 s[62:63], s[20:21], 24 ; GFX11-NEXT: s_branch .LBB13_3 ; GFX11-NEXT: .LBB13_2: ; GFX11-NEXT: ; implicit-def: $vcc_hi @@ -10879,7 +10990,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: s_mov_b32 s101, -1 -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 0 +; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 0 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 @@ -10889,7 +11000,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 1 +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 1 ; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 @@ -10901,7 +11012,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 2 +; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 2 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 @@ -10912,7 +11023,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 3 +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 3 ; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 @@ -10924,7 +11035,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 4 +; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 4 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 @@ -10935,7 +11046,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 5 +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 5 ; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; implicit-def: $sgpr45 ; GFX11-NEXT: ; implicit-def: $sgpr44 @@ -10999,20 +11110,20 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr104 ; GFX11-NEXT: ; implicit-def: $sgpr88 ; GFX11-NEXT: ; implicit-def: $sgpr76 -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 6 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 7 +; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 6 +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 7 ; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 8 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 9 +; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 8 +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 9 ; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 10 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 11 +; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 10 +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 11 ; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 12 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 13 +; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 12 +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 13 ; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 14 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 15 +; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 14 +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 15 ; GFX11-NEXT: .LBB13_3: ; %Flow ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s101 ; GFX11-NEXT: s_mov_b32 s101, s104 @@ -11021,587 +11132,588 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: s_mov_b32 s69, s42 ; GFX11-NEXT: s_cbranch_vccnz .LBB13_5 ; GFX11-NEXT: ; %bb.4: ; %cmp.true -; GFX11-NEXT: s_add_i32 s25, s25, 3 -; GFX11-NEXT: s_add_i32 s24, s24, 3 -; GFX11-NEXT: s_lshr_b32 s42, s25, 8 -; GFX11-NEXT: s_add_i32 s23, s23, 3 -; GFX11-NEXT: v_writelane_b32 v19, s42, 16 -; GFX11-NEXT: s_lshr_b32 s42, s24, 16 -; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_lshr_b32 s42, s19, 8 ; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: v_writelane_b32 v37, s42, 16 +; GFX11-NEXT: s_lshr_b32 s42, s18, 16 ; GFX11-NEXT: s_add_i32 s20, s20, 3 -; GFX11-NEXT: v_writelane_b32 v19, s42, 17 -; GFX11-NEXT: s_lshr_b32 s42, s24, 8 -; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s23, s23, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: v_writelane_b32 v37, s42, 17 +; GFX11-NEXT: s_lshr_b32 s42, s18, 8 +; GFX11-NEXT: s_add_i32 s25, s25, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: v_writelane_b32 v37, s42, 18 +; GFX11-NEXT: s_lshr_b32 s42, s21, 24 +; GFX11-NEXT: s_lshr_b64 s[62:63], s[0:1], 24 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: v_writelane_b32 v37, s42, 19 +; GFX11-NEXT: s_lshr_b32 s42, s21, 16 ; GFX11-NEXT: s_add_i32 s5, s5, 3 ; GFX11-NEXT: s_add_i32 s4, s4, 3 -; GFX11-NEXT: v_writelane_b32 v19, s42, 18 -; GFX11-NEXT: s_lshr_b32 s42, s23, 24 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[4:5], 24 ; GFX11-NEXT: s_add_i32 s7, s7, 3 +; GFX11-NEXT: v_writelane_b32 v37, s42, 20 +; GFX11-NEXT: s_lshr_b32 s42, s21, 8 ; GFX11-NEXT: s_add_i32 s6, s6, 3 -; GFX11-NEXT: v_writelane_b32 v19, s42, 19 -; GFX11-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-NEXT: s_add_i32 s24, s24, 3 ; GFX11-NEXT: s_add_i32 s9, s9, 3 +; GFX11-NEXT: v_writelane_b32 v37, s42, 21 +; GFX11-NEXT: s_lshr_b32 s42, s20, 16 ; GFX11-NEXT: s_add_i32 s8, s8, 3 +; GFX11-NEXT: s_add_i32 s27, s27, 3 ; GFX11-NEXT: s_add_i32 s11, s11, 3 -; GFX11-NEXT: v_writelane_b32 v19, s42, 20 -; GFX11-NEXT: s_lshr_b32 s42, s23, 8 +; GFX11-NEXT: v_writelane_b32 v37, s42, 22 +; GFX11-NEXT: s_lshr_b32 s42, s20, 8 ; GFX11-NEXT: s_add_i32 s10, s10, 3 -; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s26, s26, 3 ; GFX11-NEXT: s_add_i32 s13, s13, 3 -; GFX11-NEXT: v_writelane_b32 v19, s42, 21 -; GFX11-NEXT: s_lshr_b32 s42, s22, 16 +; GFX11-NEXT: v_writelane_b32 v37, s42, 23 +; GFX11-NEXT: s_lshr_b32 s42, s23, 24 ; GFX11-NEXT: s_add_i32 s12, s12, 3 -; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s29, s29, 3 ; GFX11-NEXT: s_add_i32 s15, s15, 3 -; GFX11-NEXT: v_writelane_b32 v19, s42, 22 -; GFX11-NEXT: s_lshr_b32 s42, s22, 8 +; GFX11-NEXT: v_writelane_b32 v37, s42, 24 +; GFX11-NEXT: s_lshr_b32 s42, s23, 16 ; GFX11-NEXT: s_add_i32 s14, s14, 3 -; GFX11-NEXT: s_add_i32 s16, s16, 3 ; GFX11-NEXT: s_add_i32 s41, s41, 3 -; GFX11-NEXT: v_writelane_b32 v19, s42, 23 -; GFX11-NEXT: s_lshr_b32 s42, s21, 24 ; GFX11-NEXT: s_add_i32 s40, s40, 3 -; GFX11-NEXT: s_add_i32 s3, s3, 3 -; GFX11-NEXT: s_add_i32 s29, s29, 3 -; GFX11-NEXT: v_writelane_b32 v19, s42, 24 -; GFX11-NEXT: s_lshr_b32 s42, s21, 16 +; GFX11-NEXT: v_writelane_b32 v37, s42, 25 +; GFX11-NEXT: s_lshr_b32 s42, s23, 8 ; GFX11-NEXT: s_add_i32 s28, s28, 3 -; GFX11-NEXT: s_add_i32 s1, s1, 3 -; GFX11-NEXT: s_add_i32 s0, s0, 3 -; GFX11-NEXT: v_writelane_b32 v19, s42, 25 -; GFX11-NEXT: s_lshr_b32 s42, s21, 8 -; GFX11-NEXT: s_add_i32 s2, s2, 3 -; GFX11-NEXT: s_add_i32 s27, s27, 3 -; GFX11-NEXT: s_add_i32 s26, s26, 3 -; GFX11-NEXT: v_writelane_b32 v19, s42, 26 -; GFX11-NEXT: s_lshr_b32 s42, s20, 16 -; GFX11-NEXT: s_lshr_b32 s101, s5, 24 -; GFX11-NEXT: s_lshr_b32 s102, s5, 16 -; GFX11-NEXT: s_lshr_b32 s103, s5, 8 -; GFX11-NEXT: v_writelane_b32 v19, s42, 27 -; GFX11-NEXT: s_lshr_b32 s42, s20, 8 -; GFX11-NEXT: s_lshr_b32 s104, s4, 16 -; GFX11-NEXT: s_lshr_b32 s47, s4, 8 -; GFX11-NEXT: s_lshr_b32 s46, s7, 24 -; GFX11-NEXT: v_writelane_b32 v19, s42, 28 -; GFX11-NEXT: s_lshr_b32 s42, s19, 24 -; GFX11-NEXT: s_lshr_b32 vcc_hi, s7, 16 -; GFX11-NEXT: s_lshr_b32 s34, s7, 8 -; GFX11-NEXT: s_lshr_b32 s57, s6, 16 -; GFX11-NEXT: v_writelane_b32 v19, s42, 29 -; GFX11-NEXT: s_lshr_b32 s42, s19, 16 -; GFX11-NEXT: s_lshr_b32 s56, s6, 8 -; GFX11-NEXT: s_lshr_b32 s35, s9, 24 -; GFX11-NEXT: s_lshr_b32 s36, s9, 16 -; GFX11-NEXT: v_writelane_b32 v19, s42, 30 -; GFX11-NEXT: s_lshr_b32 s42, s19, 8 -; GFX11-NEXT: s_lshr_b32 s37, s9, 8 -; GFX11-NEXT: s_lshr_b32 s38, s8, 16 -; GFX11-NEXT: s_lshr_b32 s39, s8, 8 -; GFX11-NEXT: v_writelane_b32 v19, s42, 31 -; GFX11-NEXT: s_lshr_b32 s42, s18, 16 -; GFX11-NEXT: s_lshr_b32 s48, s11, 24 -; GFX11-NEXT: v_writelane_b32 v18, s42, 0 -; GFX11-NEXT: s_lshr_b32 s42, s18, 8 -; GFX11-NEXT: v_writelane_b32 v19, s62, 14 -; GFX11-NEXT: s_lshr_b32 s49, s11, 16 -; GFX11-NEXT: s_lshr_b32 s50, s11, 8 -; GFX11-NEXT: v_writelane_b32 v18, s42, 1 -; GFX11-NEXT: s_lshr_b32 s42, s17, 24 -; GFX11-NEXT: v_writelane_b32 v19, s63, 15 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: v_writelane_b32 v37, s42, 26 +; GFX11-NEXT: s_lshr_b32 s42, s22, 16 +; GFX11-NEXT: s_lshr_b32 s101, s1, 24 +; GFX11-NEXT: s_lshr_b32 s102, s1, 16 +; GFX11-NEXT: s_lshr_b32 s103, s1, 8 +; GFX11-NEXT: v_writelane_b32 v37, s42, 27 +; GFX11-NEXT: s_lshr_b32 s42, s22, 8 +; GFX11-NEXT: s_lshr_b32 s104, s0, 16 +; GFX11-NEXT: s_lshr_b32 s47, s0, 8 +; GFX11-NEXT: s_lshr_b32 s46, s3, 24 +; GFX11-NEXT: v_writelane_b32 v37, s42, 28 +; GFX11-NEXT: s_lshr_b32 s42, s25, 24 +; GFX11-NEXT: s_lshr_b32 vcc_hi, s3, 16 +; GFX11-NEXT: s_lshr_b32 s34, s3, 8 +; GFX11-NEXT: s_lshr_b32 s57, s2, 16 +; GFX11-NEXT: v_writelane_b32 v37, s42, 29 +; GFX11-NEXT: s_lshr_b32 s42, s25, 16 +; GFX11-NEXT: s_lshr_b32 s56, s2, 8 +; GFX11-NEXT: s_lshr_b32 s35, s5, 24 +; GFX11-NEXT: s_lshr_b32 s36, s5, 16 +; GFX11-NEXT: v_writelane_b32 v37, s42, 30 +; GFX11-NEXT: s_lshr_b32 s42, s25, 8 +; GFX11-NEXT: s_lshr_b32 s37, s5, 8 +; GFX11-NEXT: s_lshr_b32 s38, s4, 16 +; GFX11-NEXT: s_lshr_b32 s39, s4, 8 +; GFX11-NEXT: v_writelane_b32 v37, s42, 31 +; GFX11-NEXT: s_lshr_b32 s42, s24, 16 +; GFX11-NEXT: s_lshr_b32 s48, s7, 24 +; GFX11-NEXT: v_writelane_b32 v36, s42, 0 +; GFX11-NEXT: s_lshr_b32 s42, s24, 8 +; GFX11-NEXT: v_writelane_b32 v37, s62, 14 +; GFX11-NEXT: s_lshr_b32 s49, s7, 16 +; GFX11-NEXT: s_lshr_b32 s50, s7, 8 +; GFX11-NEXT: v_writelane_b32 v36, s42, 1 +; GFX11-NEXT: s_lshr_b32 s42, s27, 24 +; GFX11-NEXT: v_writelane_b32 v37, s63, 15 +; GFX11-NEXT: s_lshr_b64 s[62:63], s[2:3], 24 +; GFX11-NEXT: s_lshr_b32 s51, s6, 16 +; GFX11-NEXT: v_writelane_b32 v36, s42, 2 +; GFX11-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-NEXT: v_writelane_b32 v37, s62, 12 +; GFX11-NEXT: s_lshr_b32 s52, s6, 8 +; GFX11-NEXT: s_lshr_b32 s53, s9, 24 +; GFX11-NEXT: v_writelane_b32 v36, s42, 3 +; GFX11-NEXT: s_lshr_b32 s42, s27, 8 +; GFX11-NEXT: v_writelane_b32 v37, s63, 13 +; GFX11-NEXT: s_lshr_b64 s[62:63], s[4:5], 24 +; GFX11-NEXT: s_lshr_b32 s54, s9, 16 +; GFX11-NEXT: v_writelane_b32 v36, s42, 4 +; GFX11-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-NEXT: v_writelane_b32 v37, s62, 10 +; GFX11-NEXT: s_lshr_b32 s55, s9, 8 +; GFX11-NEXT: s_lshr_b32 s64, s8, 16 +; GFX11-NEXT: v_writelane_b32 v36, s42, 5 +; GFX11-NEXT: s_lshr_b32 s42, s26, 8 +; GFX11-NEXT: v_writelane_b32 v37, s63, 11 ; GFX11-NEXT: s_lshr_b64 s[62:63], s[6:7], 24 -; GFX11-NEXT: s_lshr_b32 s51, s10, 16 -; GFX11-NEXT: v_writelane_b32 v18, s42, 2 -; GFX11-NEXT: s_lshr_b32 s42, s17, 16 -; GFX11-NEXT: v_writelane_b32 v19, s62, 12 -; GFX11-NEXT: s_lshr_b32 s52, s10, 8 -; GFX11-NEXT: s_lshr_b32 s53, s13, 24 -; GFX11-NEXT: v_writelane_b32 v18, s42, 3 -; GFX11-NEXT: s_lshr_b32 s42, s17, 8 -; GFX11-NEXT: v_writelane_b32 v19, s63, 13 +; GFX11-NEXT: s_lshr_b32 s65, s8, 8 +; GFX11-NEXT: v_writelane_b32 v36, s42, 6 +; GFX11-NEXT: s_lshr_b32 s42, s29, 24 +; GFX11-NEXT: v_writelane_b32 v37, s62, 8 +; GFX11-NEXT: s_lshr_b32 s66, s11, 24 +; GFX11-NEXT: s_lshr_b32 s67, s11, 16 +; GFX11-NEXT: v_writelane_b32 v36, s42, 7 +; GFX11-NEXT: s_lshr_b32 s42, s29, 16 +; GFX11-NEXT: v_writelane_b32 v37, s63, 9 ; GFX11-NEXT: s_lshr_b64 s[62:63], s[8:9], 24 -; GFX11-NEXT: s_lshr_b32 s54, s13, 16 -; GFX11-NEXT: v_writelane_b32 v18, s42, 4 -; GFX11-NEXT: s_lshr_b32 s42, s16, 16 -; GFX11-NEXT: v_writelane_b32 v19, s62, 10 -; GFX11-NEXT: s_lshr_b32 s55, s13, 8 -; GFX11-NEXT: s_lshr_b32 s64, s12, 16 -; GFX11-NEXT: v_writelane_b32 v18, s42, 5 -; GFX11-NEXT: s_lshr_b32 s42, s16, 8 -; GFX11-NEXT: v_writelane_b32 v19, s63, 11 +; GFX11-NEXT: s_lshr_b32 s68, s11, 8 +; GFX11-NEXT: v_writelane_b32 v36, s42, 8 +; GFX11-NEXT: s_lshr_b32 s59, s10, 16 +; GFX11-NEXT: v_writelane_b32 v37, s62, 6 +; GFX11-NEXT: s_lshr_b32 s58, s10, 8 +; GFX11-NEXT: s_lshr_b32 s70, s13, 24 +; GFX11-NEXT: s_lshr_b32 s71, s13, 16 +; GFX11-NEXT: s_lshr_b32 s60, s13, 8 +; GFX11-NEXT: v_writelane_b32 v37, s63, 7 ; GFX11-NEXT: s_lshr_b64 s[62:63], s[10:11], 24 -; GFX11-NEXT: s_lshr_b32 s65, s12, 8 -; GFX11-NEXT: v_writelane_b32 v18, s42, 6 -; GFX11-NEXT: s_lshr_b32 s42, s3, 24 -; GFX11-NEXT: v_writelane_b32 v19, s62, 8 -; GFX11-NEXT: s_lshr_b32 s66, s15, 24 -; GFX11-NEXT: s_lshr_b32 s67, s15, 16 -; GFX11-NEXT: v_writelane_b32 v18, s42, 7 -; GFX11-NEXT: s_lshr_b32 s42, s3, 16 -; GFX11-NEXT: v_writelane_b32 v19, s63, 9 +; GFX11-NEXT: s_lshr_b32 s80, s12, 16 +; GFX11-NEXT: s_lshr_b32 s61, s12, 8 +; GFX11-NEXT: s_lshr_b32 s81, s15, 24 +; GFX11-NEXT: v_writelane_b32 v37, s62, 4 +; GFX11-NEXT: s_lshr_b32 s82, s15, 16 +; GFX11-NEXT: s_lshr_b32 s83, s15, 8 +; GFX11-NEXT: s_lshr_b32 s84, s14, 16 +; GFX11-NEXT: s_lshr_b32 s85, s14, 8 +; GFX11-NEXT: v_writelane_b32 v37, s63, 5 ; GFX11-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 -; GFX11-NEXT: s_lshr_b32 s68, s15, 8 -; GFX11-NEXT: v_writelane_b32 v18, s42, 8 -; GFX11-NEXT: s_lshr_b32 s59, s14, 16 -; GFX11-NEXT: v_writelane_b32 v19, s62, 6 -; GFX11-NEXT: s_lshr_b32 s58, s14, 8 -; GFX11-NEXT: s_lshr_b32 s70, s41, 24 -; GFX11-NEXT: s_lshr_b32 s71, s41, 16 -; GFX11-NEXT: s_lshr_b32 s60, s41, 8 -; GFX11-NEXT: v_writelane_b32 v19, s63, 7 +; GFX11-NEXT: s_lshr_b32 s86, s17, 24 +; GFX11-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-NEXT: s_lshr_b32 s87, s17, 8 +; GFX11-NEXT: v_writelane_b32 v37, s62, 2 +; GFX11-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-NEXT: s_lshr_b32 s96, s16, 8 +; GFX11-NEXT: s_lshr_b32 s97, s19, 24 +; GFX11-NEXT: s_lshr_b32 s69, s19, 16 +; GFX11-NEXT: v_writelane_b32 v37, s63, 3 ; GFX11-NEXT: s_lshr_b64 s[62:63], s[14:15], 24 -; GFX11-NEXT: s_lshr_b32 s80, s40, 16 -; GFX11-NEXT: s_lshr_b32 s61, s40, 8 -; GFX11-NEXT: s_lshr_b32 s81, s29, 24 -; GFX11-NEXT: v_writelane_b32 v19, s62, 4 -; GFX11-NEXT: s_lshr_b32 s82, s29, 16 -; GFX11-NEXT: s_lshr_b32 s83, s29, 8 -; GFX11-NEXT: s_lshr_b32 s84, s28, 16 -; GFX11-NEXT: s_lshr_b32 s85, s28, 8 -; GFX11-NEXT: v_writelane_b32 v19, s63, 5 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[40:41], 24 -; GFX11-NEXT: s_lshr_b32 s86, s27, 24 -; GFX11-NEXT: s_lshr_b32 s72, s27, 16 -; GFX11-NEXT: s_lshr_b32 s87, s27, 8 -; GFX11-NEXT: v_writelane_b32 v19, s62, 2 -; GFX11-NEXT: s_lshr_b32 s73, s26, 16 -; GFX11-NEXT: s_lshr_b32 s96, s26, 8 -; GFX11-NEXT: s_lshr_b32 s97, s25, 24 -; GFX11-NEXT: s_lshr_b32 s69, s25, 16 -; GFX11-NEXT: v_writelane_b32 v19, s63, 3 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[28:29], 24 -; GFX11-NEXT: s_lshr_b32 s42, s3, 8 -; GFX11-NEXT: s_lshr_b32 s74, s2, 16 -; GFX11-NEXT: s_lshr_b32 s43, s2, 8 -; GFX11-NEXT: v_writelane_b32 v19, s62, 0 -; GFX11-NEXT: s_lshr_b32 s98, s1, 24 -; GFX11-NEXT: s_lshr_b32 s99, s1, 16 -; GFX11-NEXT: s_lshr_b32 s100, s1, 8 -; GFX11-NEXT: s_lshr_b32 s44, s0, 16 -; GFX11-NEXT: s_lshr_b32 s45, s0, 8 -; GFX11-NEXT: v_writelane_b32 v19, s63, 1 -; GFX11-NEXT: s_lshr_b64 s[76:77], s[26:27], 24 -; GFX11-NEXT: s_lshr_b64 s[88:89], s[24:25], 24 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 -; GFX11-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 -; GFX11-NEXT: s_lshr_b64 s[90:91], s[18:19], 24 -; GFX11-NEXT: s_lshr_b64 s[92:93], s[16:17], 24 -; GFX11-NEXT: s_lshr_b64 s[94:95], s[2:3], 24 -; GFX11-NEXT: s_lshr_b64 s[30:31], s[0:1], 24 -; GFX11-NEXT: v_writelane_b32 v18, s42, 9 +; GFX11-NEXT: s_lshr_b32 s42, s29, 8 +; GFX11-NEXT: s_lshr_b32 s74, s28, 16 +; GFX11-NEXT: s_lshr_b32 s43, s28, 8 +; GFX11-NEXT: v_writelane_b32 v37, s62, 0 +; GFX11-NEXT: s_lshr_b32 s98, s41, 24 +; GFX11-NEXT: s_lshr_b32 s99, s41, 16 +; GFX11-NEXT: s_lshr_b32 s100, s41, 8 +; GFX11-NEXT: s_lshr_b32 s44, s40, 16 +; GFX11-NEXT: s_lshr_b32 s45, s40, 8 +; GFX11-NEXT: v_writelane_b32 v37, s63, 1 +; GFX11-NEXT: s_lshr_b64 s[76:77], s[16:17], 24 +; GFX11-NEXT: s_lshr_b64 s[88:89], s[18:19], 24 +; GFX11-NEXT: s_lshr_b64 s[62:63], s[20:21], 24 +; GFX11-NEXT: s_lshr_b64 s[78:79], s[22:23], 24 +; GFX11-NEXT: s_lshr_b64 s[90:91], s[24:25], 24 +; GFX11-NEXT: s_lshr_b64 s[92:93], s[26:27], 24 +; GFX11-NEXT: s_lshr_b64 s[94:95], s[28:29], 24 +; GFX11-NEXT: s_lshr_b64 s[30:31], s[40:41], 24 +; GFX11-NEXT: v_writelane_b32 v36, s42, 9 ; GFX11-NEXT: .LBB13_5: ; %end ; GFX11-NEXT: s_lshl_b32 s43, s43, 8 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_and_b32 s28, s28, 0xff ; GFX11-NEXT: s_and_b32 s42, s74, 0xff -; GFX11-NEXT: s_or_b32 s2, s2, s43 +; GFX11-NEXT: s_or_b32 s28, s28, s43 ; GFX11-NEXT: s_lshl_b32 s43, s94, 8 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_and_b32 s28, s28, 0xffff +; GFX11-NEXT: s_or_b32 s42, s42, s43 +; GFX11-NEXT: s_and_b32 s29, s29, 0xff +; GFX11-NEXT: s_lshl_b32 s42, s42, 16 +; GFX11-NEXT: v_readlane_b32 s43, v36, 7 +; GFX11-NEXT: s_or_b32 s28, s28, s42 +; GFX11-NEXT: v_readlane_b32 s42, v36, 9 +; GFX11-NEXT: s_and_b32 s26, s26, 0xff +; GFX11-NEXT: s_and_b32 s27, s27, 0xff +; GFX11-NEXT: s_lshl_b32 s43, s43, 8 +; GFX11-NEXT: s_and_b32 s24, s24, 0xff +; GFX11-NEXT: s_lshl_b32 s42, s42, 8 +; GFX11-NEXT: s_and_b32 s25, s25, 0xff +; GFX11-NEXT: s_or_b32 s29, s29, s42 +; GFX11-NEXT: v_readlane_b32 s42, v36, 8 +; GFX11-NEXT: s_and_b32 s29, s29, 0xffff +; GFX11-NEXT: s_and_b32 s22, s22, 0xff +; GFX11-NEXT: s_and_b32 s23, s23, 0xff +; GFX11-NEXT: s_and_b32 s20, s20, 0xff +; GFX11-NEXT: s_and_b32 s42, s42, 0xff +; GFX11-NEXT: s_and_b32 s21, s21, 0xff ; GFX11-NEXT: s_or_b32 s42, s42, s43 ; GFX11-NEXT: s_lshl_b32 s45, s45, 8 ; GFX11-NEXT: s_lshl_b32 s42, s42, 16 -; GFX11-NEXT: s_and_b32 s0, s0, 0xff -; GFX11-NEXT: s_or_b32 s2, s2, s42 -; GFX11-NEXT: v_readlane_b32 s42, v18, 9 -; GFX11-NEXT: s_or_b32 s0, s0, s45 +; GFX11-NEXT: s_and_b32 s40, s40, 0xff +; GFX11-NEXT: s_or_b32 s29, s29, s42 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v3, s28 :: v_dual_mov_b32 v4, s29 +; GFX11-NEXT: v_readlane_b32 s28, v36, 6 +; GFX11-NEXT: v_readlane_b32 s29, v36, 5 +; GFX11-NEXT: s_or_b32 s40, s40, s45 ; GFX11-NEXT: s_lshl_b32 s45, s30, 8 ; GFX11-NEXT: s_and_b32 s44, s44, 0xff -; GFX11-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-NEXT: s_lshl_b32 s28, s28, 8 +; GFX11-NEXT: s_and_b32 s29, s29, 0xff +; GFX11-NEXT: s_or_b32 s26, s26, s28 +; GFX11-NEXT: s_lshl_b32 s28, s92, 8 +; GFX11-NEXT: s_and_b32 s26, s26, 0xffff +; GFX11-NEXT: s_or_b32 s28, s29, s28 +; GFX11-NEXT: v_readlane_b32 s29, v36, 2 +; GFX11-NEXT: s_lshl_b32 s28, s28, 16 ; GFX11-NEXT: s_or_b32 s44, s44, s45 -; GFX11-NEXT: s_lshl_b32 s42, s42, 8 -; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_or_b32 s26, s26, s28 +; GFX11-NEXT: v_readlane_b32 s28, v36, 4 +; GFX11-NEXT: s_lshl_b32 s29, s29, 8 +; GFX11-NEXT: s_and_b32 s18, s18, 0xff +; GFX11-NEXT: s_and_b32 s40, s40, 0xffff ; GFX11-NEXT: s_lshl_b32 s44, s44, 16 -; GFX11-NEXT: s_or_b32 s3, s3, s42 -; GFX11-NEXT: v_readlane_b32 s42, v18, 8 -; GFX11-NEXT: v_readlane_b32 s43, v18, 7 -; GFX11-NEXT: s_or_b32 s0, s0, s44 -; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_lshl_b32 s28, s28, 8 +; GFX11-NEXT: s_or_b32 s40, s40, s44 +; GFX11-NEXT: s_or_b32 s27, s27, s28 +; GFX11-NEXT: v_readlane_b32 s28, v36, 3 +; GFX11-NEXT: s_and_b32 s27, s27, 0xffff +; GFX11-NEXT: s_and_b32 s41, s41, 0xff ; GFX11-NEXT: s_lshl_b32 s44, s100, 8 ; GFX11-NEXT: s_lshl_b32 s45, s98, 8 -; GFX11-NEXT: s_or_b32 s1, s1, s44 +; GFX11-NEXT: s_and_b32 s28, s28, 0xff +; GFX11-NEXT: s_or_b32 s41, s41, s44 +; GFX11-NEXT: s_or_b32 s28, s28, s29 +; GFX11-NEXT: v_readlane_b32 s29, v36, 0 +; GFX11-NEXT: s_lshl_b32 s28, s28, 16 ; GFX11-NEXT: s_and_b32 s44, s99, 0xff -; GFX11-NEXT: s_and_b32 s42, s42, 0xff +; GFX11-NEXT: s_or_b32 s27, s27, s28 +; GFX11-NEXT: v_readlane_b32 s28, v36, 1 +; GFX11-NEXT: s_and_b32 s29, s29, 0xff +; GFX11-NEXT: v_dual_mov_b32 v5, s26 :: v_dual_mov_b32 v6, s27 +; GFX11-NEXT: v_readlane_b32 s26, v37, 19 +; GFX11-NEXT: s_lshl_b32 s28, s28, 8 ; GFX11-NEXT: s_or_b32 s44, s44, s45 -; GFX11-NEXT: s_lshl_b32 s43, s43, 8 -; GFX11-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-NEXT: s_or_b32 s24, s24, s28 +; GFX11-NEXT: s_lshl_b32 s28, s90, 8 +; GFX11-NEXT: s_and_b32 s24, s24, 0xffff +; GFX11-NEXT: s_or_b32 s28, s29, s28 +; GFX11-NEXT: v_readlane_b32 s29, v37, 29 +; GFX11-NEXT: s_lshl_b32 s28, s28, 16 +; GFX11-NEXT: s_lshl_b32 s26, s26, 8 +; GFX11-NEXT: s_or_b32 s24, s24, s28 +; GFX11-NEXT: v_readlane_b32 s28, v37, 31 +; GFX11-NEXT: s_lshl_b32 s29, s29, 8 +; GFX11-NEXT: s_and_b32 s19, s19, 0xff +; GFX11-NEXT: s_and_b32 s41, s41, 0xffff ; GFX11-NEXT: s_lshl_b32 s44, s44, 16 -; GFX11-NEXT: s_or_b32 s42, s42, s43 -; GFX11-NEXT: s_or_b32 s1, s1, s44 -; GFX11-NEXT: s_and_b32 s3, s3, 0xffff -; GFX11-NEXT: s_lshl_b32 s42, s42, 16 -; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 -; GFX11-NEXT: v_readlane_b32 s0, v18, 6 -; GFX11-NEXT: s_or_b32 s3, s3, s42 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 -; GFX11-NEXT: v_readlane_b32 s2, v18, 5 -; GFX11-NEXT: s_lshl_b32 s0, s0, 8 -; GFX11-NEXT: s_and_b32 s1, s16, 0xff -; GFX11-NEXT: v_readlane_b32 s3, v18, 2 -; GFX11-NEXT: s_or_b32 s0, s1, s0 -; GFX11-NEXT: s_lshl_b32 s1, s92, 8 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: s_and_b32 s0, s0, 0xffff -; GFX11-NEXT: s_or_b32 s1, s2, s1 -; GFX11-NEXT: v_readlane_b32 s2, v18, 4 -; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: s_lshl_b32 s3, s3, 8 -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_and_b32 s1, s17, 0xff -; GFX11-NEXT: s_lshl_b32 s2, s2, 8 -; GFX11-NEXT: v_readlane_b32 s16, v18, 0 -; GFX11-NEXT: s_or_b32 s1, s1, s2 -; GFX11-NEXT: v_readlane_b32 s2, v18, 3 -; GFX11-NEXT: s_and_b32 s1, s1, 0xffff -; GFX11-NEXT: v_readlane_b32 s17, v19, 29 -; GFX11-NEXT: s_and_b32 s16, s16, 0xff -; GFX11-NEXT: v_readlane_b32 s100, v17, 4 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: v_readlane_b32 s99, v17, 3 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_and_b32 s3, s18, 0xff -; GFX11-NEXT: s_lshl_b32 s2, s2, 16 -; GFX11-NEXT: s_lshl_b32 s17, s17, 8 -; GFX11-NEXT: s_or_b32 s1, s1, s2 -; GFX11-NEXT: v_readlane_b32 s2, v18, 1 -; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 -; GFX11-NEXT: v_readlane_b32 s0, v19, 28 -; GFX11-NEXT: s_and_b32 s1, s20, 0xff -; GFX11-NEXT: s_lshl_b32 s2, s2, 8 -; GFX11-NEXT: v_readlane_b32 s18, v19, 19 -; GFX11-NEXT: s_or_b32 s2, s3, s2 -; GFX11-NEXT: s_lshl_b32 s3, s90, 8 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_or_b32 s3, s16, s3 -; GFX11-NEXT: v_readlane_b32 s16, v19, 31 -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_lshl_b32 s0, s0, 8 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_and_b32 s3, s19, 0xff -; GFX11-NEXT: s_lshl_b32 s16, s16, 8 -; GFX11-NEXT: s_or_b32 s0, s1, s0 -; GFX11-NEXT: s_or_b32 s3, s3, s16 -; GFX11-NEXT: v_readlane_b32 s16, v19, 30 -; GFX11-NEXT: s_and_b32 s3, s3, 0xffff -; GFX11-NEXT: s_lshl_b32 s1, s78, 8 -; GFX11-NEXT: s_and_b32 s0, s0, 0xffff -; GFX11-NEXT: s_lshl_b32 s18, s18, 8 +; GFX11-NEXT: s_lshl_b32 s28, s28, 8 +; GFX11-NEXT: s_or_b32 s41, s41, s44 +; GFX11-NEXT: s_or_b32 s25, s25, s28 +; GFX11-NEXT: v_readlane_b32 s28, v37, 30 +; GFX11-NEXT: s_and_b32 s25, s25, 0xffff +; GFX11-NEXT: v_dual_mov_b32 v1, s40 :: v_dual_mov_b32 v2, s41 ; GFX11-NEXT: s_and_b32 s16, s16, 0xff -; GFX11-NEXT: s_lshl_b32 s19, s86, 8 -; GFX11-NEXT: s_or_b32 s16, s16, s17 -; GFX11-NEXT: v_readlane_b32 s17, v19, 21 -; GFX11-NEXT: s_lshl_b32 s16, s16, 16 -; GFX11-NEXT: v_readlane_b32 s98, v17, 2 -; GFX11-NEXT: s_or_b32 s3, s3, s16 +; GFX11-NEXT: s_and_b32 s28, s28, 0xff +; GFX11-NEXT: s_and_b32 s17, s17, 0xff +; GFX11-NEXT: s_or_b32 s28, s28, s29 +; GFX11-NEXT: s_and_b32 s14, s14, 0xff +; GFX11-NEXT: s_lshl_b32 s28, s28, 16 +; GFX11-NEXT: s_and_b32 s15, s15, 0xff +; GFX11-NEXT: s_or_b32 s25, s25, s28 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 -; GFX11-NEXT: v_readlane_b32 s2, v19, 27 -; GFX11-NEXT: v_readlane_b32 s3, v19, 24 -; GFX11-NEXT: v_readlane_b32 s16, v19, 22 -; GFX11-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-NEXT: v_dual_mov_b32 v7, s24 :: v_dual_mov_b32 v8, s25 +; GFX11-NEXT: v_readlane_b32 s24, v37, 28 +; GFX11-NEXT: v_readlane_b32 s25, v37, 27 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-NEXT: s_and_b32 s12, s12, 0xff +; GFX11-NEXT: s_lshl_b32 s24, s24, 8 +; GFX11-NEXT: s_and_b32 s25, s25, 0xff +; GFX11-NEXT: s_or_b32 s22, s22, s24 +; GFX11-NEXT: s_lshl_b32 s24, s78, 8 +; GFX11-NEXT: s_and_b32 s22, s22, 0xffff +; GFX11-NEXT: s_or_b32 s24, s25, s24 +; GFX11-NEXT: v_readlane_b32 s25, v37, 24 +; GFX11-NEXT: s_lshl_b32 s24, s24, 16 +; GFX11-NEXT: s_and_b32 s13, s13, 0xff +; GFX11-NEXT: s_or_b32 s22, s22, s24 +; GFX11-NEXT: v_readlane_b32 s24, v37, 26 +; GFX11-NEXT: s_lshl_b32 s25, s25, 8 +; GFX11-NEXT: s_and_b32 s10, s10, 0xff +; GFX11-NEXT: s_and_b32 s11, s11, 0xff +; GFX11-NEXT: s_and_b32 s8, s8, 0xff +; GFX11-NEXT: s_lshl_b32 s24, s24, 8 +; GFX11-NEXT: s_and_b32 s9, s9, 0xff +; GFX11-NEXT: s_or_b32 s23, s23, s24 +; GFX11-NEXT: v_readlane_b32 s24, v37, 25 +; GFX11-NEXT: s_and_b32 s23, s23, 0xffff +; GFX11-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-NEXT: s_and_b32 s7, s7, 0xff +; GFX11-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-NEXT: s_and_b32 s24, s24, 0xff +; GFX11-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-NEXT: s_or_b32 s24, s24, s25 +; GFX11-NEXT: v_readlane_b32 s25, v37, 22 +; GFX11-NEXT: s_lshl_b32 s24, s24, 16 ; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s3, 8 -; GFX11-NEXT: s_or_b32 s1, s2, s1 -; GFX11-NEXT: v_readlane_b32 s2, v19, 26 -; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: s_and_b32 s16, s16, 0xff -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_and_b32 s1, s21, 0xff -; GFX11-NEXT: s_lshl_b32 s2, s2, 8 -; GFX11-NEXT: v_readlane_b32 s86, v16, 30 -; GFX11-NEXT: s_or_b32 s1, s1, s2 -; GFX11-NEXT: v_readlane_b32 s2, v19, 25 -; GFX11-NEXT: s_and_b32 s1, s1, 0xffff -; GFX11-NEXT: v_readlane_b32 s31, v16, 1 -; GFX11-NEXT: v_readlane_b32 s30, v16, 0 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_and_b32 s3, s22, 0xff -; GFX11-NEXT: s_lshl_b32 s2, s2, 16 -; GFX11-NEXT: s_or_b32 s1, s1, s2 -; GFX11-NEXT: v_readlane_b32 s2, v19, 23 -; GFX11-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 -; GFX11-NEXT: v_readlane_b32 s1, v19, 18 -; GFX11-NEXT: s_and_b32 s0, s24, 0xff -; GFX11-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-NEXT: s_or_b32 s23, s23, s24 +; GFX11-NEXT: v_readlane_b32 s24, v37, 23 +; GFX11-NEXT: s_and_b32 s25, s25, 0xff +; GFX11-NEXT: v_dual_mov_b32 v9, s22 :: v_dual_mov_b32 v10, s23 +; GFX11-NEXT: s_lshl_b32 s22, s88, 8 +; GFX11-NEXT: s_lshl_b32 s24, s24, 8 +; GFX11-NEXT: s_lshl_b32 s23, s97, 8 +; GFX11-NEXT: s_or_b32 s20, s20, s24 +; GFX11-NEXT: s_lshl_b32 s24, s62, 8 +; GFX11-NEXT: s_and_b32 s20, s20, 0xffff +; GFX11-NEXT: s_or_b32 s24, s25, s24 +; GFX11-NEXT: v_readlane_b32 s25, v37, 21 +; GFX11-NEXT: s_lshl_b32 s24, s24, 16 +; GFX11-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-NEXT: s_or_b32 s20, s20, s24 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s25, s25, 8 +; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_or_b32 s21, s21, s25 +; GFX11-NEXT: v_readlane_b32 s25, v37, 20 +; GFX11-NEXT: s_and_b32 s21, s21, 0xffff +; GFX11-NEXT: v_readlane_b32 s100, v35, 4 +; GFX11-NEXT: v_readlane_b32 s99, v35, 3 +; GFX11-NEXT: v_readlane_b32 s98, v35, 2 +; GFX11-NEXT: s_and_b32 s25, s25, 0xff +; GFX11-NEXT: v_readlane_b32 s97, v35, 1 +; GFX11-NEXT: s_or_b32 s25, s25, s26 +; GFX11-NEXT: v_readlane_b32 s31, v34, 1 +; GFX11-NEXT: s_lshl_b32 s24, s25, 16 +; GFX11-NEXT: v_readlane_b32 s30, v34, 0 +; GFX11-NEXT: s_or_b32 s21, s21, s24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b32 s2, s3, s2 -; GFX11-NEXT: s_lshl_b32 s3, s62, 8 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_or_b32 s3, s16, s3 -; GFX11-NEXT: s_and_b32 s16, s23, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_or_b32 s16, s16, s17 -; GFX11-NEXT: v_readlane_b32 s17, v19, 20 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_and_b32 s3, s16, 0xffff -; GFX11-NEXT: s_lshl_b32 s1, s1, 8 -; GFX11-NEXT: s_and_b32 s17, s17, 0xff -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s17, s17, s18 -; GFX11-NEXT: s_and_b32 s0, s0, 0xffff -; GFX11-NEXT: s_lshl_b32 s16, s17, 16 -; GFX11-NEXT: s_lshl_b32 s17, s97, 8 -; GFX11-NEXT: s_or_b32 s3, s3, s16 +; GFX11-NEXT: v_dual_mov_b32 v11, s20 :: v_dual_mov_b32 v12, s21 +; GFX11-NEXT: v_readlane_b32 s20, v37, 18 +; GFX11-NEXT: v_readlane_b32 s21, v37, 17 +; GFX11-NEXT: s_lshl_b32 s20, s20, 8 +; GFX11-NEXT: s_and_b32 s21, s21, 0xff +; GFX11-NEXT: s_or_b32 s18, s18, s20 +; GFX11-NEXT: s_or_b32 s20, s21, s22 +; GFX11-NEXT: v_readlane_b32 s21, v37, 16 +; GFX11-NEXT: s_and_b32 s22, s69, 0xff +; GFX11-NEXT: s_and_b32 s18, s18, 0xffff +; GFX11-NEXT: s_lshl_b32 s20, s20, 16 +; GFX11-NEXT: v_readlane_b32 s69, v34, 21 +; GFX11-NEXT: s_lshl_b32 s21, s21, 8 +; GFX11-NEXT: s_or_b32 s18, s18, s20 +; GFX11-NEXT: s_or_b32 s19, s19, s21 +; GFX11-NEXT: s_or_b32 s21, s22, s23 +; GFX11-NEXT: s_and_b32 s19, s19, 0xffff +; GFX11-NEXT: s_lshl_b32 s21, s21, 16 +; GFX11-NEXT: s_lshl_b32 s20, s96, 8 +; GFX11-NEXT: s_or_b32 s19, s19, s21 +; GFX11-NEXT: s_and_b32 s21, s73, 0xff +; GFX11-NEXT: s_lshl_b32 s22, s76, 8 +; GFX11-NEXT: s_or_b32 s16, s16, s20 +; GFX11-NEXT: s_or_b32 s20, s21, s22 +; GFX11-NEXT: s_lshl_b32 s21, s87, 8 +; GFX11-NEXT: s_and_b32 s22, s72, 0xff +; GFX11-NEXT: s_lshl_b32 s23, s86, 8 +; GFX11-NEXT: s_or_b32 s17, s17, s21 +; GFX11-NEXT: s_or_b32 s21, s22, s23 +; GFX11-NEXT: v_dual_mov_b32 v1, s18 :: v_dual_mov_b32 v2, s19 +; GFX11-NEXT: v_readlane_b32 s18, v37, 0 +; GFX11-NEXT: s_and_b32 s16, s16, 0xffff +; GFX11-NEXT: s_lshl_b32 s20, s20, 16 +; GFX11-NEXT: s_and_b32 s17, s17, 0xffff +; GFX11-NEXT: s_lshl_b32 s21, s21, 16 +; GFX11-NEXT: s_or_b32 s16, s16, s20 +; GFX11-NEXT: s_or_b32 s17, s17, s21 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 -; GFX11-NEXT: v_readlane_b32 s2, v19, 17 -; GFX11-NEXT: s_lshl_b32 s3, s88, 8 -; GFX11-NEXT: s_and_b32 s16, s69, 0xff -; GFX11-NEXT: s_and_b32 s18, s72, 0xff -; GFX11-NEXT: v_readlane_b32 s97, v17, 1 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: v_readlane_b32 s69, v16, 21 -; GFX11-NEXT: s_or_b32 s1, s2, s3 -; GFX11-NEXT: v_readlane_b32 s3, v19, 16 -; GFX11-NEXT: s_and_b32 s2, s25, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_lshl_b32 s3, s3, 8 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s16, s17 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_and_b32 s16, s73, 0xff -; GFX11-NEXT: s_or_b32 s1, s2, s3 -; GFX11-NEXT: s_and_b32 s2, s26, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s96, 8 -; GFX11-NEXT: s_lshl_b32 s17, s76, 8 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s16, s17 -; GFX11-NEXT: s_and_b32 s16, s27, 0xff -; GFX11-NEXT: s_lshl_b32 s17, s87, 8 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_or_b32 s16, s16, s17 +; GFX11-NEXT: v_dual_mov_b32 v3, s16 :: v_dual_mov_b32 v4, s17 +; GFX11-NEXT: s_lshl_b32 s16, s85, 8 +; GFX11-NEXT: s_and_b32 s17, s84, 0xff +; GFX11-NEXT: s_lshl_b32 s18, s18, 8 +; GFX11-NEXT: v_readlane_b32 s19, v37, 1 +; GFX11-NEXT: s_or_b32 s14, s14, s16 +; GFX11-NEXT: s_or_b32 s16, s17, s18 +; GFX11-NEXT: s_lshl_b32 s17, s83, 8 +; GFX11-NEXT: s_and_b32 s18, s82, 0xff +; GFX11-NEXT: s_lshl_b32 s19, s81, 8 +; GFX11-NEXT: s_or_b32 s15, s15, s17 ; GFX11-NEXT: s_or_b32 s17, s18, s19 -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_and_b32 s16, s16, 0xffff +; GFX11-NEXT: v_readlane_b32 s18, v37, 2 +; GFX11-NEXT: s_and_b32 s14, s14, 0xffff +; GFX11-NEXT: s_lshl_b32 s16, s16, 16 +; GFX11-NEXT: s_and_b32 s15, s15, 0xffff ; GFX11-NEXT: s_lshl_b32 s17, s17, 16 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s16, s17 -; GFX11-NEXT: v_readlane_b32 s16, v19, 0 -; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 -; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 -; GFX11-NEXT: s_and_b32 s0, s28, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s85, 8 -; GFX11-NEXT: s_and_b32 s2, s84, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s16, 8 -; GFX11-NEXT: v_readlane_b32 s17, v19, 1 -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s2, s3 -; GFX11-NEXT: s_and_b32 s2, s29, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s83, 8 -; GFX11-NEXT: s_and_b32 s16, s82, 0xff -; GFX11-NEXT: s_lshl_b32 s17, s81, 8 -; GFX11-NEXT: v_readlane_b32 s18, v19, 2 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s16, s17 -; GFX11-NEXT: s_and_b32 s0, s0, 0xffff -; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s2, s3 -; GFX11-NEXT: s_and_b32 s2, s40, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s61, 8 -; GFX11-NEXT: s_and_b32 s16, s80, 0xff -; GFX11-NEXT: s_lshl_b32 s17, s18, 8 -; GFX11-NEXT: v_readlane_b32 s19, v19, 3 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s16, s17 -; GFX11-NEXT: s_and_b32 s16, s41, 0xff +; GFX11-NEXT: s_or_b32 s14, s14, s16 +; GFX11-NEXT: s_or_b32 s15, s15, s17 +; GFX11-NEXT: s_lshl_b32 s16, s61, 8 +; GFX11-NEXT: s_and_b32 s17, s80, 0xff +; GFX11-NEXT: s_lshl_b32 s18, s18, 8 +; GFX11-NEXT: v_readlane_b32 s19, v37, 3 +; GFX11-NEXT: s_or_b32 s12, s12, s16 +; GFX11-NEXT: s_or_b32 s16, s17, s18 ; GFX11-NEXT: s_lshl_b32 s17, s60, 8 ; GFX11-NEXT: s_and_b32 s18, s71, 0xff ; GFX11-NEXT: s_lshl_b32 s19, s70, 8 -; GFX11-NEXT: s_or_b32 s16, s16, s17 +; GFX11-NEXT: s_or_b32 s13, s13, s17 ; GFX11-NEXT: s_or_b32 s17, s18, s19 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_and_b32 s16, s16, 0xffff +; GFX11-NEXT: v_dual_mov_b32 v5, s14 :: v_dual_mov_b32 v6, s15 +; GFX11-NEXT: v_readlane_b32 s14, v37, 4 +; GFX11-NEXT: s_and_b32 s12, s12, 0xffff +; GFX11-NEXT: s_lshl_b32 s16, s16, 16 +; GFX11-NEXT: s_and_b32 s13, s13, 0xffff ; GFX11-NEXT: s_lshl_b32 s17, s17, 16 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s16, s17 -; GFX11-NEXT: v_readlane_b32 s16, v19, 4 -; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 -; GFX11-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 -; GFX11-NEXT: s_and_b32 s0, s14, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s58, 8 -; GFX11-NEXT: s_and_b32 s2, s59, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s16, 8 -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s2, s3 -; GFX11-NEXT: s_and_b32 s2, s15, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s68, 8 +; GFX11-NEXT: s_or_b32 s12, s12, s16 +; GFX11-NEXT: s_or_b32 s13, s13, s17 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v7, s12 :: v_dual_mov_b32 v8, s13 +; GFX11-NEXT: s_lshl_b32 s12, s58, 8 +; GFX11-NEXT: s_and_b32 s13, s59, 0xff +; GFX11-NEXT: s_lshl_b32 s14, s14, 8 +; GFX11-NEXT: v_readlane_b32 s15, v37, 5 +; GFX11-NEXT: s_or_b32 s10, s10, s12 +; GFX11-NEXT: s_or_b32 s12, s13, s14 +; GFX11-NEXT: s_lshl_b32 s13, s68, 8 ; GFX11-NEXT: s_and_b32 s14, s67, 0xff ; GFX11-NEXT: s_lshl_b32 s15, s66, 8 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s14, s15 -; GFX11-NEXT: v_readlane_b32 s14, v19, 6 -; GFX11-NEXT: s_and_b32 s0, s0, 0xffff -; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s2, s3 -; GFX11-NEXT: s_and_b32 s2, s12, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s65, 8 -; GFX11-NEXT: s_and_b32 s12, s64, 0xff +; GFX11-NEXT: s_or_b32 s11, s11, s13 +; GFX11-NEXT: s_or_b32 s13, s14, s15 +; GFX11-NEXT: v_readlane_b32 s14, v37, 6 +; GFX11-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-NEXT: s_lshl_b32 s12, s12, 16 +; GFX11-NEXT: s_and_b32 s11, s11, 0xffff +; GFX11-NEXT: s_lshl_b32 s13, s13, 16 +; GFX11-NEXT: s_or_b32 s10, s10, s12 +; GFX11-NEXT: s_or_b32 s11, s11, s13 +; GFX11-NEXT: s_lshl_b32 s12, s65, 8 +; GFX11-NEXT: s_and_b32 s13, s64, 0xff ; GFX11-NEXT: s_lshl_b32 s14, s14, 8 -; GFX11-NEXT: v_readlane_b32 s15, v19, 7 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s12, s14 -; GFX11-NEXT: s_and_b32 s12, s13, 0xff +; GFX11-NEXT: v_readlane_b32 s15, v37, 7 +; GFX11-NEXT: s_or_b32 s8, s8, s12 +; GFX11-NEXT: s_or_b32 s12, s13, s14 ; GFX11-NEXT: s_lshl_b32 s13, s55, 8 ; GFX11-NEXT: s_and_b32 s14, s54, 0xff ; GFX11-NEXT: s_lshl_b32 s15, s53, 8 -; GFX11-NEXT: s_or_b32 s12, s12, s13 +; GFX11-NEXT: s_or_b32 s9, s9, s13 ; GFX11-NEXT: s_or_b32 s13, s14, s15 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_and_b32 s12, s12, 0xffff -; GFX11-NEXT: s_lshl_b32 s13, s13, 16 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s12, s13 -; GFX11-NEXT: v_readlane_b32 s12, v19, 8 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:48 -; GFX11-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 -; GFX11-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 -; GFX11-NEXT: s_and_b32 s0, s10, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s52, 8 -; GFX11-NEXT: s_and_b32 s2, s51, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s12, 8 -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s2, s3 -; GFX11-NEXT: s_and_b32 s2, s11, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s50, 8 +; GFX11-NEXT: v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11 +; GFX11-NEXT: v_readlane_b32 s10, v37, 8 +; GFX11-NEXT: s_and_b32 s8, s8, 0xffff +; GFX11-NEXT: s_lshl_b32 s12, s12, 16 +; GFX11-NEXT: s_and_b32 s9, s9, 0xffff +; GFX11-NEXT: s_lshl_b32 s13, s13, 16 +; GFX11-NEXT: s_or_b32 s8, s8, s12 +; GFX11-NEXT: s_or_b32 s9, s9, s13 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v11, s8 :: v_dual_mov_b32 v12, s9 +; GFX11-NEXT: s_lshl_b32 s8, s52, 8 +; GFX11-NEXT: s_and_b32 s9, s51, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s10, 8 +; GFX11-NEXT: v_readlane_b32 s11, v37, 9 +; GFX11-NEXT: s_or_b32 s6, s6, s8 +; GFX11-NEXT: s_or_b32 s8, s9, s10 +; GFX11-NEXT: s_lshl_b32 s9, s50, 8 ; GFX11-NEXT: s_and_b32 s10, s49, 0xff ; GFX11-NEXT: s_lshl_b32 s11, s48, 8 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s10, s11 -; GFX11-NEXT: v_readlane_b32 s10, v19, 10 -; GFX11-NEXT: s_and_b32 s0, s0, 0xffff -; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s2, s3 -; GFX11-NEXT: s_and_b32 s2, s8, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s39, 8 -; GFX11-NEXT: s_and_b32 s8, s38, 0xff +; GFX11-NEXT: s_or_b32 s7, s7, s9 +; GFX11-NEXT: s_or_b32 s9, s10, s11 +; GFX11-NEXT: v_readlane_b32 s10, v37, 10 +; GFX11-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-NEXT: s_or_b32 s6, s6, s8 +; GFX11-NEXT: s_or_b32 s7, s7, s9 +; GFX11-NEXT: s_lshl_b32 s8, s39, 8 +; GFX11-NEXT: s_and_b32 s9, s38, 0xff ; GFX11-NEXT: s_lshl_b32 s10, s10, 8 -; GFX11-NEXT: v_readlane_b32 s11, v19, 11 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s8, s10 -; GFX11-NEXT: s_and_b32 s8, s9, 0xff +; GFX11-NEXT: v_readlane_b32 s11, v37, 11 +; GFX11-NEXT: s_or_b32 s4, s4, s8 +; GFX11-NEXT: s_or_b32 s8, s9, s10 ; GFX11-NEXT: s_lshl_b32 s9, s37, 8 ; GFX11-NEXT: s_and_b32 s10, s36, 0xff ; GFX11-NEXT: s_lshl_b32 s11, s35, 8 -; GFX11-NEXT: s_or_b32 s8, s8, s9 +; GFX11-NEXT: s_or_b32 s5, s5, s9 ; GFX11-NEXT: s_or_b32 s9, s10, s11 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_and_b32 s8, s8, 0xffff +; GFX11-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7 +; GFX11-NEXT: v_readlane_b32 s6, v37, 12 +; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-NEXT: s_and_b32 s5, s5, 0xffff ; GFX11-NEXT: s_lshl_b32 s9, s9, 16 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s8, s9 -; GFX11-NEXT: v_readlane_b32 s8, v19, 12 -; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 -; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 -; GFX11-NEXT: s_and_b32 s0, s6, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s56, 8 -; GFX11-NEXT: s_and_b32 s2, s57, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s8, 8 -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s2, s3 -; GFX11-NEXT: s_and_b32 s2, s7, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s34, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s8 +; GFX11-NEXT: s_or_b32 s5, s5, s9 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s5 +; GFX11-NEXT: s_lshl_b32 s4, s56, 8 +; GFX11-NEXT: s_and_b32 s5, s57, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-NEXT: v_readlane_b32 s7, v37, 13 +; GFX11-NEXT: s_or_b32 s2, s2, s4 +; GFX11-NEXT: s_or_b32 s4, s5, s6 +; GFX11-NEXT: s_lshl_b32 s5, s34, 8 ; GFX11-NEXT: s_and_b32 s6, vcc_hi, 0xff ; GFX11-NEXT: s_lshl_b32 s7, s46, 8 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s6, s7 -; GFX11-NEXT: v_readlane_b32 s6, v19, 14 -; GFX11-NEXT: s_and_b32 s0, s0, 0xffff -; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_or_b32 s3, s3, s5 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: v_readlane_b32 s6, v37, 14 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s2, s3 -; GFX11-NEXT: s_and_b32 s2, s4, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s47, 8 -; GFX11-NEXT: s_and_b32 s4, s104, 0xff +; GFX11-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_or_b32 s2, s2, s4 +; GFX11-NEXT: s_or_b32 s3, s3, s5 +; GFX11-NEXT: s_lshl_b32 s4, s47, 8 +; GFX11-NEXT: s_and_b32 s5, s104, 0xff ; GFX11-NEXT: s_lshl_b32 s6, s6, 8 -; GFX11-NEXT: v_readlane_b32 s7, v19, 15 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s4, s6 -; GFX11-NEXT: s_and_b32 s4, s5, 0xff +; GFX11-NEXT: v_readlane_b32 s7, v37, 15 +; GFX11-NEXT: s_or_b32 s0, s0, s4 +; GFX11-NEXT: s_or_b32 s4, s5, s6 ; GFX11-NEXT: s_lshl_b32 s5, s103, 8 ; GFX11-NEXT: s_and_b32 s6, s102, 0xff ; GFX11-NEXT: s_lshl_b32 s7, s101, 8 -; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s1, s1, s5 ; GFX11-NEXT: s_or_b32 s5, s6, s7 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-NEXT: s_and_b32 s1, s1, 0xffff ; GFX11-NEXT: s_lshl_b32 s5, s5, 16 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s4, s5 +; GFX11-NEXT: s_or_b32 s0, s0, s4 +; GFX11-NEXT: s_or_b32 s1, s1, s5 ; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:64 -; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 -; GFX11-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 -; GFX11-NEXT: v_readlane_b32 s17, v19, 5 -; GFX11-NEXT: v_readlane_b32 s13, v19, 9 -; GFX11-NEXT: v_readlane_b32 s9, v19, 13 +; GFX11-NEXT: v_dual_mov_b32 v5, s2 :: v_dual_mov_b32 v6, s3 +; GFX11-NEXT: v_dual_mov_b32 v7, s0 :: v_dual_mov_b32 v8, s1 ; GFX11-NEXT: s_clause 0x2 ; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:80 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:96 ; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:112 -; GFX11-NEXT: v_readlane_b32 s104, v17, 8 -; GFX11-NEXT: v_readlane_b32 s103, v17, 7 -; GFX11-NEXT: v_readlane_b32 s102, v17, 6 -; GFX11-NEXT: v_readlane_b32 s101, v17, 5 -; GFX11-NEXT: v_readlane_b32 s96, v17, 0 -; GFX11-NEXT: v_readlane_b32 s87, v16, 31 -; GFX11-NEXT: v_readlane_b32 s85, v16, 29 -; GFX11-NEXT: v_readlane_b32 s84, v16, 28 -; GFX11-NEXT: v_readlane_b32 s83, v16, 27 -; GFX11-NEXT: v_readlane_b32 s82, v16, 26 -; GFX11-NEXT: v_readlane_b32 s81, v16, 25 -; GFX11-NEXT: v_readlane_b32 s80, v16, 24 -; GFX11-NEXT: v_readlane_b32 s71, v16, 23 -; GFX11-NEXT: v_readlane_b32 s70, v16, 22 -; GFX11-NEXT: v_readlane_b32 s68, v16, 20 -; GFX11-NEXT: v_readlane_b32 s67, v16, 19 -; GFX11-NEXT: v_readlane_b32 s66, v16, 18 -; GFX11-NEXT: v_readlane_b32 s65, v16, 17 -; GFX11-NEXT: v_readlane_b32 s64, v16, 16 -; GFX11-NEXT: v_readlane_b32 s55, v16, 15 -; GFX11-NEXT: v_readlane_b32 s54, v16, 14 -; GFX11-NEXT: v_readlane_b32 s53, v16, 13 -; GFX11-NEXT: v_readlane_b32 s52, v16, 12 -; GFX11-NEXT: v_readlane_b32 s51, v16, 11 -; GFX11-NEXT: v_readlane_b32 s50, v16, 10 -; GFX11-NEXT: v_readlane_b32 s49, v16, 9 -; GFX11-NEXT: v_readlane_b32 s48, v16, 8 -; GFX11-NEXT: v_readlane_b32 s39, v16, 7 -; GFX11-NEXT: v_readlane_b32 s38, v16, 6 -; GFX11-NEXT: v_readlane_b32 s37, v16, 5 -; GFX11-NEXT: v_readlane_b32 s36, v16, 4 -; GFX11-NEXT: v_readlane_b32 s35, v16, 3 -; GFX11-NEXT: v_readlane_b32 s34, v16, 2 +; GFX11-NEXT: v_readlane_b32 s104, v35, 8 +; GFX11-NEXT: v_readlane_b32 s103, v35, 7 +; GFX11-NEXT: v_readlane_b32 s102, v35, 6 +; GFX11-NEXT: v_readlane_b32 s101, v35, 5 +; GFX11-NEXT: v_readlane_b32 s96, v35, 0 +; GFX11-NEXT: v_readlane_b32 s87, v34, 31 +; GFX11-NEXT: v_readlane_b32 s86, v34, 30 +; GFX11-NEXT: v_readlane_b32 s85, v34, 29 +; GFX11-NEXT: v_readlane_b32 s84, v34, 28 +; GFX11-NEXT: v_readlane_b32 s83, v34, 27 +; GFX11-NEXT: v_readlane_b32 s82, v34, 26 +; GFX11-NEXT: v_readlane_b32 s81, v34, 25 +; GFX11-NEXT: v_readlane_b32 s80, v34, 24 +; GFX11-NEXT: v_readlane_b32 s71, v34, 23 +; GFX11-NEXT: v_readlane_b32 s70, v34, 22 +; GFX11-NEXT: v_readlane_b32 s68, v34, 20 +; GFX11-NEXT: v_readlane_b32 s67, v34, 19 +; GFX11-NEXT: v_readlane_b32 s66, v34, 18 +; GFX11-NEXT: v_readlane_b32 s65, v34, 17 +; GFX11-NEXT: v_readlane_b32 s64, v34, 16 +; GFX11-NEXT: v_readlane_b32 s55, v34, 15 +; GFX11-NEXT: v_readlane_b32 s54, v34, 14 +; GFX11-NEXT: v_readlane_b32 s53, v34, 13 +; GFX11-NEXT: v_readlane_b32 s52, v34, 12 +; GFX11-NEXT: v_readlane_b32 s51, v34, 11 +; GFX11-NEXT: v_readlane_b32 s50, v34, 10 +; GFX11-NEXT: v_readlane_b32 s49, v34, 9 +; GFX11-NEXT: v_readlane_b32 s48, v34, 8 +; GFX11-NEXT: v_readlane_b32 s39, v34, 7 +; GFX11-NEXT: v_readlane_b32 s38, v34, 6 +; GFX11-NEXT: v_readlane_b32 s37, v34, 5 +; GFX11-NEXT: v_readlane_b32 s36, v34, 4 +; GFX11-NEXT: v_readlane_b32 s35, v34, 3 +; GFX11-NEXT: v_readlane_b32 s34, v34, 2 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload -; GFX11-NEXT: scratch_load_b32 v16, off, s32 -; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v19, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v34, off, s32 +; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:12 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -23464,47 +23576,75 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_writelane_b32 v20, s30, 0 -; SI-NEXT: v_writelane_b32 v20, s31, 1 -; SI-NEXT: v_writelane_b32 v20, s34, 2 -; SI-NEXT: v_writelane_b32 v20, s35, 3 -; SI-NEXT: v_writelane_b32 v20, s36, 4 -; SI-NEXT: v_writelane_b32 v20, s37, 5 -; SI-NEXT: v_writelane_b32 v20, s38, 6 -; SI-NEXT: v_writelane_b32 v20, s39, 7 -; SI-NEXT: v_writelane_b32 v20, s48, 8 -; SI-NEXT: v_writelane_b32 v20, s49, 9 -; SI-NEXT: v_writelane_b32 v20, s50, 10 -; SI-NEXT: v_writelane_b32 v20, s51, 11 -; SI-NEXT: v_writelane_b32 v20, s52, 12 -; SI-NEXT: v_writelane_b32 v20, s53, 13 -; SI-NEXT: v_writelane_b32 v20, s54, 14 -; SI-NEXT: v_writelane_b32 v20, s55, 15 -; SI-NEXT: v_writelane_b32 v20, s64, 16 -; SI-NEXT: v_writelane_b32 v20, s65, 17 -; SI-NEXT: v_writelane_b32 v20, s66, 18 -; SI-NEXT: v_writelane_b32 v20, s67, 19 -; SI-NEXT: v_writelane_b32 v20, s68, 20 -; SI-NEXT: v_writelane_b32 v20, s69, 21 -; SI-NEXT: v_writelane_b32 v20, s70, 22 -; SI-NEXT: v_writelane_b32 v20, s71, 23 -; SI-NEXT: v_writelane_b32 v20, s80, 24 -; SI-NEXT: v_writelane_b32 v20, s81, 25 -; SI-NEXT: v_writelane_b32 v20, s82, 26 -; SI-NEXT: v_writelane_b32 v20, s83, 27 -; SI-NEXT: v_writelane_b32 v20, s84, 28 -; SI-NEXT: v_writelane_b32 v20, s85, 29 -; SI-NEXT: v_writelane_b32 v20, s86, 30 -; SI-NEXT: v_writelane_b32 v20, s87, 31 -; SI-NEXT: v_writelane_b32 v20, s96, 32 -; SI-NEXT: v_writelane_b32 v20, s97, 33 -; SI-NEXT: v_writelane_b32 v20, s98, 34 +; SI-NEXT: v_writelane_b32 v21, s30, 0 +; SI-NEXT: v_writelane_b32 v21, s31, 1 +; SI-NEXT: v_writelane_b32 v21, s34, 2 +; SI-NEXT: v_writelane_b32 v21, s35, 3 +; SI-NEXT: v_writelane_b32 v21, s36, 4 +; SI-NEXT: v_writelane_b32 v21, s37, 5 +; SI-NEXT: v_writelane_b32 v21, s38, 6 +; SI-NEXT: v_writelane_b32 v21, s39, 7 +; SI-NEXT: v_writelane_b32 v21, s48, 8 +; SI-NEXT: v_writelane_b32 v21, s49, 9 +; SI-NEXT: v_writelane_b32 v21, s50, 10 +; SI-NEXT: v_writelane_b32 v21, s51, 11 +; SI-NEXT: v_writelane_b32 v21, s52, 12 +; SI-NEXT: v_writelane_b32 v21, s53, 13 +; SI-NEXT: v_writelane_b32 v21, s54, 14 +; SI-NEXT: v_writelane_b32 v21, s55, 15 +; SI-NEXT: v_writelane_b32 v21, s64, 16 +; SI-NEXT: v_writelane_b32 v21, s65, 17 +; SI-NEXT: v_writelane_b32 v21, s66, 18 +; SI-NEXT: v_writelane_b32 v21, s67, 19 +; SI-NEXT: v_writelane_b32 v21, s68, 20 +; SI-NEXT: v_writelane_b32 v21, s69, 21 +; SI-NEXT: v_mov_b32_e32 v20, s16 +; SI-NEXT: v_writelane_b32 v21, s70, 22 +; SI-NEXT: v_readfirstlane_b32 s48, v20 +; SI-NEXT: v_mov_b32_e32 v20, s17 +; SI-NEXT: v_writelane_b32 v21, s71, 23 +; SI-NEXT: v_readfirstlane_b32 s49, v20 +; SI-NEXT: v_mov_b32_e32 v20, s18 +; SI-NEXT: v_writelane_b32 v21, s80, 24 +; SI-NEXT: v_readfirstlane_b32 s50, v20 +; SI-NEXT: v_mov_b32_e32 v20, s19 +; SI-NEXT: v_writelane_b32 v21, s81, 25 +; SI-NEXT: v_readfirstlane_b32 s51, v20 +; SI-NEXT: v_mov_b32_e32 v20, s20 +; SI-NEXT: v_writelane_b32 v21, s82, 26 +; SI-NEXT: v_readfirstlane_b32 s52, v20 +; SI-NEXT: v_mov_b32_e32 v20, s21 +; SI-NEXT: v_writelane_b32 v21, s83, 27 +; SI-NEXT: v_readfirstlane_b32 s53, v20 +; SI-NEXT: v_mov_b32_e32 v20, s22 +; SI-NEXT: v_writelane_b32 v21, s84, 28 +; SI-NEXT: v_readfirstlane_b32 s54, v20 +; SI-NEXT: v_mov_b32_e32 v20, s23 +; SI-NEXT: v_writelane_b32 v21, s85, 29 +; SI-NEXT: v_readfirstlane_b32 s55, v20 +; SI-NEXT: v_mov_b32_e32 v20, s24 +; SI-NEXT: v_writelane_b32 v21, s86, 30 +; SI-NEXT: v_readfirstlane_b32 s64, v20 +; SI-NEXT: v_mov_b32_e32 v20, s25 +; SI-NEXT: v_writelane_b32 v21, s87, 31 +; SI-NEXT: v_readfirstlane_b32 s65, v20 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_writelane_b32 v21, s96, 32 +; SI-NEXT: v_readfirstlane_b32 s66, v20 +; SI-NEXT: v_mov_b32_e32 v20, s27 +; SI-NEXT: v_writelane_b32 v21, s97, 33 +; SI-NEXT: v_readfirstlane_b32 s67, v20 +; SI-NEXT: v_mov_b32_e32 v20, s28 +; SI-NEXT: v_writelane_b32 v21, s98, 34 +; SI-NEXT: v_readfirstlane_b32 s68, v20 +; SI-NEXT: v_mov_b32_e32 v20, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: v_writelane_b32 v20, s99, 35 +; SI-NEXT: v_writelane_b32 v21, s99, 35 +; SI-NEXT: v_readfirstlane_b32 s69, v20 ; SI-NEXT: v_readfirstlane_b32 s70, v1 ; SI-NEXT: v_readfirstlane_b32 s71, v2 ; SI-NEXT: v_readfirstlane_b32 s80, v3 @@ -23524,97 +23664,83 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a ; SI-NEXT: v_readfirstlane_b32 s8, v17 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_readfirstlane_b32 s9, v18 -; SI-NEXT: ; implicit-def: $vgpr21 : SGPR spill to VGPR lane +; SI-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane ; SI-NEXT: s_cbranch_scc0 .LBB17_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v21, s4, 1 -; SI-NEXT: s_lshl_b32 s4, s17, 16 -; SI-NEXT: v_writelane_b32 v21, s4, 0 -; SI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; SI-NEXT: v_writelane_b32 v21, s4, 2 -; SI-NEXT: s_lshl_b32 s4, s16, 16 +; SI-NEXT: s_and_b32 s4, s49, 0xffff0000 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v22, s4, 1 +; SI-NEXT: s_lshl_b32 s4, s49, 16 +; SI-NEXT: v_writelane_b32 v22, s4, 0 +; SI-NEXT: s_and_b32 s4, s48, 0xffff0000 +; SI-NEXT: v_writelane_b32 v22, s4, 2 +; SI-NEXT: s_lshl_b32 s4, s48, 16 ; SI-NEXT: s_and_b32 s11, s9, 0xffff0000 ; SI-NEXT: s_lshl_b32 s10, s9, 16 ; SI-NEXT: s_and_b32 s13, s8, 0xffff0000 ; SI-NEXT: s_lshl_b32 s12, s8, 16 ; SI-NEXT: s_and_b32 s15, s7, 0xffff0000 ; SI-NEXT: s_lshl_b32 s14, s7, 16 -; SI-NEXT: s_and_b32 s41, s6, 0xffff0000 -; SI-NEXT: s_lshl_b32 s40, s6, 16 -; SI-NEXT: s_and_b32 s43, s99, 0xffff0000 -; SI-NEXT: s_lshl_b32 s42, s99, 16 -; SI-NEXT: s_and_b32 s45, s98, 0xffff0000 -; SI-NEXT: s_lshl_b32 s44, s98, 16 -; SI-NEXT: s_and_b32 s47, s97, 0xffff0000 -; SI-NEXT: s_lshl_b32 s46, s97, 16 -; SI-NEXT: s_and_b32 s57, s96, 0xffff0000 -; SI-NEXT: s_lshl_b32 s56, s96, 16 -; SI-NEXT: s_and_b32 s59, s87, 0xffff0000 -; SI-NEXT: s_lshl_b32 s58, s87, 16 -; SI-NEXT: s_and_b32 s61, s86, 0xffff0000 -; SI-NEXT: s_lshl_b32 s60, s86, 16 -; SI-NEXT: s_and_b32 s63, s85, 0xffff0000 -; SI-NEXT: s_lshl_b32 s62, s85, 16 -; SI-NEXT: s_and_b32 s73, s84, 0xffff0000 -; SI-NEXT: s_lshl_b32 s72, s84, 16 -; SI-NEXT: s_and_b32 s75, s83, 0xffff0000 -; SI-NEXT: s_lshl_b32 s74, s83, 16 -; SI-NEXT: s_and_b32 s77, s82, 0xffff0000 -; SI-NEXT: s_lshl_b32 s76, s82, 16 -; SI-NEXT: s_and_b32 s79, s81, 0xffff0000 -; SI-NEXT: s_lshl_b32 s78, s81, 16 -; SI-NEXT: s_and_b32 s89, s80, 0xffff0000 -; SI-NEXT: s_lshl_b32 s88, s80, 16 -; SI-NEXT: s_and_b32 s91, s71, 0xffff0000 -; SI-NEXT: s_lshl_b32 s90, s71, 16 -; SI-NEXT: s_and_b32 s93, s70, 0xffff0000 -; SI-NEXT: s_lshl_b32 s92, s70, 16 -; SI-NEXT: s_and_b32 s95, s29, 0xffff0000 -; SI-NEXT: s_lshl_b32 s94, s29, 16 -; SI-NEXT: s_and_b32 s31, s28, 0xffff0000 -; SI-NEXT: s_lshl_b32 s30, s28, 16 -; SI-NEXT: s_and_b32 s35, s27, 0xffff0000 -; SI-NEXT: s_lshl_b32 s34, s27, 16 -; SI-NEXT: s_and_b32 s37, s26, 0xffff0000 -; SI-NEXT: s_lshl_b32 s36, s26, 16 -; SI-NEXT: s_and_b32 s39, s25, 0xffff0000 -; SI-NEXT: s_lshl_b32 s38, s25, 16 -; SI-NEXT: s_and_b32 s49, s24, 0xffff0000 -; SI-NEXT: s_lshl_b32 s48, s24, 16 -; SI-NEXT: s_and_b32 s51, s23, 0xffff0000 -; SI-NEXT: s_lshl_b32 s50, s23, 16 -; SI-NEXT: s_and_b32 s53, s22, 0xffff0000 -; SI-NEXT: s_lshl_b32 s52, s22, 16 -; SI-NEXT: s_and_b32 s55, s21, 0xffff0000 -; SI-NEXT: s_lshl_b32 s54, s21, 16 -; SI-NEXT: s_and_b32 s65, s20, 0xffff0000 -; SI-NEXT: s_lshl_b32 s64, s20, 16 -; SI-NEXT: s_and_b32 s67, s19, 0xffff0000 -; SI-NEXT: s_lshl_b32 s66, s19, 16 -; SI-NEXT: s_and_b32 s69, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s68, s18, 16 -; SI-NEXT: v_writelane_b32 v21, s4, 3 +; SI-NEXT: s_and_b32 s17, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s16, s6, 16 +; SI-NEXT: s_and_b32 s19, s99, 0xffff0000 +; SI-NEXT: s_lshl_b32 s18, s99, 16 +; SI-NEXT: s_and_b32 s21, s98, 0xffff0000 +; SI-NEXT: s_lshl_b32 s20, s98, 16 +; SI-NEXT: s_and_b32 s23, s97, 0xffff0000 +; SI-NEXT: s_lshl_b32 s22, s97, 16 +; SI-NEXT: s_and_b32 s25, s96, 0xffff0000 +; SI-NEXT: s_lshl_b32 s24, s96, 16 +; SI-NEXT: s_and_b32 s27, s87, 0xffff0000 +; SI-NEXT: s_lshl_b32 s26, s87, 16 +; SI-NEXT: s_and_b32 s29, s86, 0xffff0000 +; SI-NEXT: s_lshl_b32 s28, s86, 16 +; SI-NEXT: s_and_b32 s41, s85, 0xffff0000 +; SI-NEXT: s_lshl_b32 s40, s85, 16 +; SI-NEXT: s_and_b32 s43, s84, 0xffff0000 +; SI-NEXT: s_lshl_b32 s42, s84, 16 +; SI-NEXT: s_and_b32 s45, s83, 0xffff0000 +; SI-NEXT: s_lshl_b32 s44, s83, 16 +; SI-NEXT: s_and_b32 s47, s82, 0xffff0000 +; SI-NEXT: s_lshl_b32 s46, s82, 16 +; SI-NEXT: s_and_b32 s57, s81, 0xffff0000 +; SI-NEXT: s_lshl_b32 s56, s81, 16 +; SI-NEXT: s_and_b32 s59, s80, 0xffff0000 +; SI-NEXT: s_lshl_b32 s58, s80, 16 +; SI-NEXT: s_and_b32 s61, s71, 0xffff0000 +; SI-NEXT: s_lshl_b32 s60, s71, 16 +; SI-NEXT: s_and_b32 s63, s70, 0xffff0000 +; SI-NEXT: s_lshl_b32 s62, s70, 16 +; SI-NEXT: s_and_b32 s73, s69, 0xffff0000 +; SI-NEXT: s_lshl_b32 s72, s69, 16 +; SI-NEXT: s_and_b32 s75, s68, 0xffff0000 +; SI-NEXT: s_lshl_b32 s74, s68, 16 +; SI-NEXT: s_and_b32 s77, s67, 0xffff0000 +; SI-NEXT: s_lshl_b32 s76, s67, 16 +; SI-NEXT: s_and_b32 s79, s66, 0xffff0000 +; SI-NEXT: s_lshl_b32 s78, s66, 16 +; SI-NEXT: s_and_b32 s89, s65, 0xffff0000 +; SI-NEXT: s_lshl_b32 s88, s65, 16 +; SI-NEXT: s_and_b32 s91, s64, 0xffff0000 +; SI-NEXT: s_lshl_b32 s90, s64, 16 +; SI-NEXT: s_and_b32 s93, s55, 0xffff0000 +; SI-NEXT: s_lshl_b32 s92, s55, 16 +; SI-NEXT: s_and_b32 s95, s54, 0xffff0000 +; SI-NEXT: s_lshl_b32 s94, s54, 16 +; SI-NEXT: s_and_b32 s31, s53, 0xffff0000 +; SI-NEXT: s_lshl_b32 s30, s53, 16 +; SI-NEXT: s_and_b32 s35, s52, 0xffff0000 +; SI-NEXT: s_lshl_b32 s34, s52, 16 +; SI-NEXT: s_and_b32 s37, s51, 0xffff0000 +; SI-NEXT: s_lshl_b32 s36, s51, 16 +; SI-NEXT: s_and_b32 s39, s50, 0xffff0000 +; SI-NEXT: s_lshl_b32 s38, s50, 16 +; SI-NEXT: v_writelane_b32 v22, s4, 3 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB17_3 ; SI-NEXT: .LBB17_2: ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr68 -; SI-NEXT: ; implicit-def: $sgpr69 -; SI-NEXT: ; implicit-def: $sgpr66 -; SI-NEXT: ; implicit-def: $sgpr67 -; SI-NEXT: ; implicit-def: $sgpr64 -; SI-NEXT: ; implicit-def: $sgpr65 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $sgpr55 -; SI-NEXT: ; implicit-def: $sgpr52 -; SI-NEXT: ; implicit-def: $sgpr53 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $sgpr51 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr49 ; SI-NEXT: ; implicit-def: $sgpr38 ; SI-NEXT: ; implicit-def: $sgpr39 ; SI-NEXT: ; implicit-def: $sgpr36 @@ -23655,6 +23781,20 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a ; SI-NEXT: ; implicit-def: $sgpr43 ; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr21 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; implicit-def: $sgpr17 ; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: ; implicit-def: $sgpr15 ; SI-NEXT: ; implicit-def: $sgpr12 @@ -23676,8 +23816,22 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a ; SI-NEXT: s_mov_b32 s11, s13 ; SI-NEXT: s_mov_b32 s12, s14 ; SI-NEXT: s_mov_b32 s13, s15 -; SI-NEXT: s_mov_b32 s14, s40 -; SI-NEXT: s_mov_b32 s15, s41 +; SI-NEXT: s_mov_b32 s14, s16 +; SI-NEXT: s_mov_b32 s15, s17 +; SI-NEXT: s_mov_b32 s16, s18 +; SI-NEXT: s_mov_b32 s17, s19 +; SI-NEXT: s_mov_b32 s18, s20 +; SI-NEXT: s_mov_b32 s19, s21 +; SI-NEXT: s_mov_b32 s20, s22 +; SI-NEXT: s_mov_b32 s21, s23 +; SI-NEXT: s_mov_b32 s22, s24 +; SI-NEXT: s_mov_b32 s23, s25 +; SI-NEXT: s_mov_b32 s24, s26 +; SI-NEXT: s_mov_b32 s25, s27 +; SI-NEXT: s_mov_b32 s26, s28 +; SI-NEXT: s_mov_b32 s27, s29 +; SI-NEXT: s_mov_b32 s28, s40 +; SI-NEXT: s_mov_b32 s29, s41 ; SI-NEXT: s_mov_b32 s40, s42 ; SI-NEXT: s_mov_b32 s41, s43 ; SI-NEXT: s_mov_b32 s42, s44 @@ -23692,39 +23846,25 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a ; SI-NEXT: s_mov_b32 s59, s61 ; SI-NEXT: s_mov_b32 s60, s62 ; SI-NEXT: s_mov_b32 s61, s63 -; SI-NEXT: s_mov_b32 s62, s72 -; SI-NEXT: s_mov_b32 s63, s73 -; SI-NEXT: s_mov_b32 s72, s74 -; SI-NEXT: s_mov_b32 s73, s75 -; SI-NEXT: s_mov_b32 s74, s76 -; SI-NEXT: s_mov_b32 s75, s77 -; SI-NEXT: s_mov_b32 s76, s78 -; SI-NEXT: s_mov_b32 s77, s79 -; SI-NEXT: s_mov_b32 s78, s88 -; SI-NEXT: s_mov_b32 s79, s89 -; SI-NEXT: s_mov_b32 s88, s90 -; SI-NEXT: s_mov_b32 s89, s91 -; SI-NEXT: s_mov_b32 s90, s92 -; SI-NEXT: s_mov_b32 s91, s93 -; SI-NEXT: v_readlane_b32 s92, v21, 0 -; SI-NEXT: v_readlane_b32 s93, v21, 1 +; SI-NEXT: v_readlane_b32 s62, v22, 0 +; SI-NEXT: v_readlane_b32 s63, v22, 1 ; SI-NEXT: s_cbranch_vccnz .LBB17_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s48, s48, 3 ; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s49, s49, 3 +; SI-NEXT: s_add_i32 s50, s50, 3 +; SI-NEXT: s_add_i32 s51, s51, 3 +; SI-NEXT: s_add_i32 s52, s52, 3 +; SI-NEXT: s_add_i32 s53, s53, 3 +; SI-NEXT: s_add_i32 s54, s54, 3 +; SI-NEXT: s_add_i32 s55, s55, 3 +; SI-NEXT: s_add_i32 s64, s64, 3 +; SI-NEXT: s_add_i32 s65, s65, 3 +; SI-NEXT: s_add_i32 s66, s66, 3 +; SI-NEXT: s_add_i32 s67, s67, 3 +; SI-NEXT: s_add_i32 s68, s68, 3 +; SI-NEXT: s_add_i32 s69, s69, 3 ; SI-NEXT: s_add_i32 s70, s70, 3 ; SI-NEXT: s_add_i32 s71, s71, 3 ; SI-NEXT: s_add_i32 s80, s80, 3 @@ -23744,266 +23884,266 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a ; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_and_b32 s15, s6, 0xffff0000 ; SI-NEXT: s_lshl_b32 s14, s6, 16 -; SI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; SI-NEXT: s_and_b32 s6, s48, 0xffff0000 ; SI-NEXT: s_and_b32 s5, s9, 0xffff0000 ; SI-NEXT: s_lshl_b32 s4, s9, 16 ; SI-NEXT: s_and_b32 s11, s8, 0xffff0000 ; SI-NEXT: s_lshl_b32 s10, s8, 16 ; SI-NEXT: s_and_b32 s13, s7, 0xffff0000 ; SI-NEXT: s_lshl_b32 s12, s7, 16 -; SI-NEXT: s_and_b32 s41, s99, 0xffff0000 -; SI-NEXT: s_lshl_b32 s40, s99, 16 -; SI-NEXT: s_and_b32 s43, s98, 0xffff0000 -; SI-NEXT: s_lshl_b32 s42, s98, 16 -; SI-NEXT: s_and_b32 s45, s97, 0xffff0000 -; SI-NEXT: s_lshl_b32 s44, s97, 16 -; SI-NEXT: s_and_b32 s47, s96, 0xffff0000 -; SI-NEXT: s_lshl_b32 s46, s96, 16 -; SI-NEXT: s_and_b32 s57, s87, 0xffff0000 -; SI-NEXT: s_lshl_b32 s56, s87, 16 -; SI-NEXT: s_and_b32 s59, s86, 0xffff0000 -; SI-NEXT: s_lshl_b32 s58, s86, 16 -; SI-NEXT: s_and_b32 s61, s85, 0xffff0000 -; SI-NEXT: s_lshl_b32 s60, s85, 16 -; SI-NEXT: s_and_b32 s63, s84, 0xffff0000 -; SI-NEXT: s_lshl_b32 s62, s84, 16 -; SI-NEXT: s_and_b32 s73, s83, 0xffff0000 -; SI-NEXT: s_lshl_b32 s72, s83, 16 -; SI-NEXT: s_and_b32 s75, s82, 0xffff0000 -; SI-NEXT: s_lshl_b32 s74, s82, 16 -; SI-NEXT: s_and_b32 s77, s81, 0xffff0000 -; SI-NEXT: s_lshl_b32 s76, s81, 16 -; SI-NEXT: s_and_b32 s79, s80, 0xffff0000 -; SI-NEXT: s_lshl_b32 s78, s80, 16 -; SI-NEXT: s_and_b32 s89, s71, 0xffff0000 -; SI-NEXT: s_lshl_b32 s88, s71, 16 -; SI-NEXT: s_and_b32 s91, s70, 0xffff0000 -; SI-NEXT: s_lshl_b32 s90, s70, 16 -; SI-NEXT: s_and_b32 s95, s29, 0xffff0000 -; SI-NEXT: s_lshl_b32 s94, s29, 16 -; SI-NEXT: s_and_b32 s31, s28, 0xffff0000 -; SI-NEXT: s_lshl_b32 s30, s28, 16 -; SI-NEXT: s_and_b32 s35, s27, 0xffff0000 -; SI-NEXT: s_lshl_b32 s34, s27, 16 -; SI-NEXT: s_and_b32 s37, s26, 0xffff0000 -; SI-NEXT: s_lshl_b32 s36, s26, 16 -; SI-NEXT: s_and_b32 s39, s25, 0xffff0000 -; SI-NEXT: s_lshl_b32 s38, s25, 16 -; SI-NEXT: s_and_b32 s49, s24, 0xffff0000 -; SI-NEXT: s_lshl_b32 s48, s24, 16 -; SI-NEXT: s_and_b32 s51, s23, 0xffff0000 -; SI-NEXT: s_lshl_b32 s50, s23, 16 -; SI-NEXT: s_and_b32 s53, s22, 0xffff0000 -; SI-NEXT: s_lshl_b32 s52, s22, 16 -; SI-NEXT: s_and_b32 s55, s21, 0xffff0000 -; SI-NEXT: s_lshl_b32 s54, s21, 16 -; SI-NEXT: s_and_b32 s65, s20, 0xffff0000 -; SI-NEXT: s_lshl_b32 s64, s20, 16 -; SI-NEXT: s_and_b32 s67, s19, 0xffff0000 -; SI-NEXT: s_lshl_b32 s66, s19, 16 -; SI-NEXT: s_and_b32 s69, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s68, s18, 16 -; SI-NEXT: s_and_b32 s93, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s92, s17, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v21, s6, 2 -; SI-NEXT: s_lshl_b32 s6, s16, 16 -; SI-NEXT: v_writelane_b32 v21, s6, 3 +; SI-NEXT: s_and_b32 s17, s99, 0xffff0000 +; SI-NEXT: s_lshl_b32 s16, s99, 16 +; SI-NEXT: s_and_b32 s19, s98, 0xffff0000 +; SI-NEXT: s_lshl_b32 s18, s98, 16 +; SI-NEXT: s_and_b32 s21, s97, 0xffff0000 +; SI-NEXT: s_lshl_b32 s20, s97, 16 +; SI-NEXT: s_and_b32 s23, s96, 0xffff0000 +; SI-NEXT: s_lshl_b32 s22, s96, 16 +; SI-NEXT: s_and_b32 s25, s87, 0xffff0000 +; SI-NEXT: s_lshl_b32 s24, s87, 16 +; SI-NEXT: s_and_b32 s27, s86, 0xffff0000 +; SI-NEXT: s_lshl_b32 s26, s86, 16 +; SI-NEXT: s_and_b32 s29, s85, 0xffff0000 +; SI-NEXT: s_lshl_b32 s28, s85, 16 +; SI-NEXT: s_and_b32 s41, s84, 0xffff0000 +; SI-NEXT: s_lshl_b32 s40, s84, 16 +; SI-NEXT: s_and_b32 s43, s83, 0xffff0000 +; SI-NEXT: s_lshl_b32 s42, s83, 16 +; SI-NEXT: s_and_b32 s45, s82, 0xffff0000 +; SI-NEXT: s_lshl_b32 s44, s82, 16 +; SI-NEXT: s_and_b32 s47, s81, 0xffff0000 +; SI-NEXT: s_lshl_b32 s46, s81, 16 +; SI-NEXT: s_and_b32 s57, s80, 0xffff0000 +; SI-NEXT: s_lshl_b32 s56, s80, 16 +; SI-NEXT: s_and_b32 s59, s71, 0xffff0000 +; SI-NEXT: s_lshl_b32 s58, s71, 16 +; SI-NEXT: s_and_b32 s61, s70, 0xffff0000 +; SI-NEXT: s_lshl_b32 s60, s70, 16 +; SI-NEXT: s_and_b32 s73, s69, 0xffff0000 +; SI-NEXT: s_lshl_b32 s72, s69, 16 +; SI-NEXT: s_and_b32 s75, s68, 0xffff0000 +; SI-NEXT: s_lshl_b32 s74, s68, 16 +; SI-NEXT: s_and_b32 s77, s67, 0xffff0000 +; SI-NEXT: s_lshl_b32 s76, s67, 16 +; SI-NEXT: s_and_b32 s79, s66, 0xffff0000 +; SI-NEXT: s_lshl_b32 s78, s66, 16 +; SI-NEXT: s_and_b32 s89, s65, 0xffff0000 +; SI-NEXT: s_lshl_b32 s88, s65, 16 +; SI-NEXT: s_and_b32 s91, s64, 0xffff0000 +; SI-NEXT: s_lshl_b32 s90, s64, 16 +; SI-NEXT: s_and_b32 s93, s55, 0xffff0000 +; SI-NEXT: s_lshl_b32 s92, s55, 16 +; SI-NEXT: s_and_b32 s95, s54, 0xffff0000 +; SI-NEXT: s_lshl_b32 s94, s54, 16 +; SI-NEXT: s_and_b32 s31, s53, 0xffff0000 +; SI-NEXT: s_lshl_b32 s30, s53, 16 +; SI-NEXT: s_and_b32 s35, s52, 0xffff0000 +; SI-NEXT: s_lshl_b32 s34, s52, 16 +; SI-NEXT: s_and_b32 s37, s51, 0xffff0000 +; SI-NEXT: s_lshl_b32 s36, s51, 16 +; SI-NEXT: s_and_b32 s39, s50, 0xffff0000 +; SI-NEXT: s_lshl_b32 s38, s50, 16 +; SI-NEXT: s_and_b32 s63, s49, 0xffff0000 +; SI-NEXT: s_lshl_b32 s62, s49, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v22, s6, 2 +; SI-NEXT: s_lshl_b32 s6, s48, 16 +; SI-NEXT: v_writelane_b32 v22, s6, 3 ; SI-NEXT: .LBB17_5: ; %end -; SI-NEXT: v_readlane_b32 s6, v21, 2 +; SI-NEXT: v_readlane_b32 s6, v22, 2 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 -; SI-NEXT: v_readlane_b32 s6, v21, 3 +; SI-NEXT: v_readlane_b32 s6, v22, 3 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_readlane_b32 s99, v20, 35 +; SI-NEXT: v_readlane_b32 s99, v21, 35 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s92 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s62 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s39 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s38 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s67 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s36 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s64 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s34 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s54 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s30 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s53 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s52 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s94 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s50 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s92 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s91 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s48 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s90 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s39 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s38 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s88 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s36 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s78 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s34 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s76 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s30 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s74 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s94 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s72 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s91 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s90 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s60 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s59 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s88 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s58 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s57 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s78 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s56 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s47 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s76 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s46 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s45 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s74 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s44 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s43 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s72 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s42 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s41 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s62 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s40 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s29 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s60 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s28 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s59 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s27 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s58 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s26 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s57 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s25 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s56 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s24 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s47 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s23 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s46 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s22 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s45 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s21 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s44 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s20 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s43 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s42 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s18 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s41 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s40 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s16 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -24035,44 +24175,44 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: v_readlane_b32 s98, v20, 34 -; SI-NEXT: v_readlane_b32 s97, v20, 33 -; SI-NEXT: v_readlane_b32 s96, v20, 32 -; SI-NEXT: v_readlane_b32 s87, v20, 31 -; SI-NEXT: v_readlane_b32 s86, v20, 30 -; SI-NEXT: v_readlane_b32 s85, v20, 29 -; SI-NEXT: v_readlane_b32 s84, v20, 28 -; SI-NEXT: v_readlane_b32 s83, v20, 27 -; SI-NEXT: v_readlane_b32 s82, v20, 26 -; SI-NEXT: v_readlane_b32 s81, v20, 25 -; SI-NEXT: v_readlane_b32 s80, v20, 24 -; SI-NEXT: v_readlane_b32 s71, v20, 23 -; SI-NEXT: v_readlane_b32 s70, v20, 22 -; SI-NEXT: v_readlane_b32 s69, v20, 21 -; SI-NEXT: v_readlane_b32 s68, v20, 20 -; SI-NEXT: v_readlane_b32 s67, v20, 19 -; SI-NEXT: v_readlane_b32 s66, v20, 18 -; SI-NEXT: v_readlane_b32 s65, v20, 17 -; SI-NEXT: v_readlane_b32 s64, v20, 16 -; SI-NEXT: v_readlane_b32 s55, v20, 15 -; SI-NEXT: v_readlane_b32 s54, v20, 14 -; SI-NEXT: v_readlane_b32 s53, v20, 13 -; SI-NEXT: v_readlane_b32 s52, v20, 12 -; SI-NEXT: v_readlane_b32 s51, v20, 11 -; SI-NEXT: v_readlane_b32 s50, v20, 10 -; SI-NEXT: v_readlane_b32 s49, v20, 9 -; SI-NEXT: v_readlane_b32 s48, v20, 8 -; SI-NEXT: v_readlane_b32 s39, v20, 7 -; SI-NEXT: v_readlane_b32 s38, v20, 6 -; SI-NEXT: v_readlane_b32 s37, v20, 5 -; SI-NEXT: v_readlane_b32 s36, v20, 4 -; SI-NEXT: v_readlane_b32 s35, v20, 3 -; SI-NEXT: v_readlane_b32 s34, v20, 2 -; SI-NEXT: v_readlane_b32 s31, v20, 1 -; SI-NEXT: v_readlane_b32 s30, v20, 0 +; SI-NEXT: v_readlane_b32 s98, v21, 34 +; SI-NEXT: v_readlane_b32 s97, v21, 33 +; SI-NEXT: v_readlane_b32 s96, v21, 32 +; SI-NEXT: v_readlane_b32 s87, v21, 31 +; SI-NEXT: v_readlane_b32 s86, v21, 30 +; SI-NEXT: v_readlane_b32 s85, v21, 29 +; SI-NEXT: v_readlane_b32 s84, v21, 28 +; SI-NEXT: v_readlane_b32 s83, v21, 27 +; SI-NEXT: v_readlane_b32 s82, v21, 26 +; SI-NEXT: v_readlane_b32 s81, v21, 25 +; SI-NEXT: v_readlane_b32 s80, v21, 24 +; SI-NEXT: v_readlane_b32 s71, v21, 23 +; SI-NEXT: v_readlane_b32 s70, v21, 22 +; SI-NEXT: v_readlane_b32 s69, v21, 21 +; SI-NEXT: v_readlane_b32 s68, v21, 20 +; SI-NEXT: v_readlane_b32 s67, v21, 19 +; SI-NEXT: v_readlane_b32 s66, v21, 18 +; SI-NEXT: v_readlane_b32 s65, v21, 17 +; SI-NEXT: v_readlane_b32 s64, v21, 16 +; SI-NEXT: v_readlane_b32 s55, v21, 15 +; SI-NEXT: v_readlane_b32 s54, v21, 14 +; SI-NEXT: v_readlane_b32 s53, v21, 13 +; SI-NEXT: v_readlane_b32 s52, v21, 12 +; SI-NEXT: v_readlane_b32 s51, v21, 11 +; SI-NEXT: v_readlane_b32 s50, v21, 10 +; SI-NEXT: v_readlane_b32 s49, v21, 9 +; SI-NEXT: v_readlane_b32 s48, v21, 8 +; SI-NEXT: v_readlane_b32 s39, v21, 7 +; SI-NEXT: v_readlane_b32 s38, v21, 6 +; SI-NEXT: v_readlane_b32 s37, v21, 5 +; SI-NEXT: v_readlane_b32 s36, v21, 4 +; SI-NEXT: v_readlane_b32 s35, v21, 3 +; SI-NEXT: v_readlane_b32 s34, v21, 2 +; SI-NEXT: v_readlane_b32 s31, v21, 1 +; SI-NEXT: v_readlane_b32 s30, v21, 0 ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -32188,20 +32328,48 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v20, s16 +; SI-NEXT: v_readfirstlane_b32 s40, v20 +; SI-NEXT: v_mov_b32_e32 v20, s17 +; SI-NEXT: v_readfirstlane_b32 s41, v20 +; SI-NEXT: v_mov_b32_e32 v20, s18 +; SI-NEXT: v_readfirstlane_b32 s42, v20 +; SI-NEXT: v_mov_b32_e32 v20, s19 +; SI-NEXT: v_readfirstlane_b32 s43, v20 +; SI-NEXT: v_mov_b32_e32 v20, s20 +; SI-NEXT: v_readfirstlane_b32 s44, v20 +; SI-NEXT: v_mov_b32_e32 v20, s21 +; SI-NEXT: v_readfirstlane_b32 s45, v20 +; SI-NEXT: v_mov_b32_e32 v20, s22 +; SI-NEXT: v_readfirstlane_b32 s46, v20 +; SI-NEXT: v_mov_b32_e32 v20, s23 +; SI-NEXT: v_readfirstlane_b32 s47, v20 +; SI-NEXT: v_mov_b32_e32 v20, s24 +; SI-NEXT: v_readfirstlane_b32 s24, v20 +; SI-NEXT: v_mov_b32_e32 v20, s25 +; SI-NEXT: v_readfirstlane_b32 s25, v20 +; SI-NEXT: v_mov_b32_e32 v20, s26 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v63, s30, 0 +; SI-NEXT: v_readfirstlane_b32 s26, v20 +; SI-NEXT: v_mov_b32_e32 v20, s27 ; SI-NEXT: v_writelane_b32 v63, s31, 1 +; SI-NEXT: v_readfirstlane_b32 s27, v20 +; SI-NEXT: v_mov_b32_e32 v20, s28 ; SI-NEXT: v_writelane_b32 v63, s34, 2 +; SI-NEXT: v_readfirstlane_b32 s28, v20 +; SI-NEXT: v_mov_b32_e32 v20, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 ; SI-NEXT: v_writelane_b32 v63, s35, 3 -; SI-NEXT: v_readfirstlane_b32 s47, v1 -; SI-NEXT: v_readfirstlane_b32 s46, v2 -; SI-NEXT: v_readfirstlane_b32 s45, v3 -; SI-NEXT: v_readfirstlane_b32 s44, v4 -; SI-NEXT: v_readfirstlane_b32 s43, v5 -; SI-NEXT: v_readfirstlane_b32 s42, v6 -; SI-NEXT: v_readfirstlane_b32 s41, v7 -; SI-NEXT: v_readfirstlane_b32 s40, v8 +; SI-NEXT: v_readfirstlane_b32 s29, v20 +; SI-NEXT: v_readfirstlane_b32 s23, v1 +; SI-NEXT: v_readfirstlane_b32 s22, v2 +; SI-NEXT: v_readfirstlane_b32 s21, v3 +; SI-NEXT: v_readfirstlane_b32 s20, v4 +; SI-NEXT: v_readfirstlane_b32 s19, v5 +; SI-NEXT: v_readfirstlane_b32 s18, v6 +; SI-NEXT: v_readfirstlane_b32 s17, v7 +; SI-NEXT: v_readfirstlane_b32 s16, v8 ; SI-NEXT: v_readfirstlane_b32 s15, v9 ; SI-NEXT: v_readfirstlane_b32 s14, v10 ; SI-NEXT: v_readfirstlane_b32 s13, v11 @@ -32254,21 +32422,21 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 ; SI-NEXT: s_lshr_b32 s4, s15, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s40, 16 +; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s41, 16 +; SI-NEXT: s_lshr_b32 s4, s17, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s42, 16 +; SI-NEXT: s_lshr_b32 s4, s18, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_lshr_b32 s4, s43, 16 +; SI-NEXT: s_lshr_b32 s4, s19, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 -; SI-NEXT: s_lshr_b32 s4, s44, 16 +; SI-NEXT: s_lshr_b32 s4, s20, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 -; SI-NEXT: s_lshr_b32 s4, s45, 16 +; SI-NEXT: s_lshr_b32 s4, s21, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 -; SI-NEXT: s_lshr_b32 s4, s46, 16 +; SI-NEXT: s_lshr_b32 s4, s22, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 -; SI-NEXT: s_lshr_b32 s4, s47, 16 +; SI-NEXT: s_lshr_b32 s4, s23, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 ; SI-NEXT: s_lshr_b32 s4, s29, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 @@ -32282,21 +32450,21 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 ; SI-NEXT: s_lshr_b32 s4, s24, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: s_lshr_b32 s4, s47, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: s_lshr_b32 s4, s46, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: s_lshr_b32 s4, s45, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v45, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: s_lshr_b32 s4, s44, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: s_lshr_b32 s4, s43, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v57, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: s_lshr_b32 s4, s42, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: s_lshr_b32 s4, s41, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_lshr_b32 s4, s40, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f32_f16_e32 v61, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 @@ -32307,29 +32475,29 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v18, s13 ; SI-NEXT: v_cvt_f32_f16_e32 v20, s14 ; SI-NEXT: v_cvt_f32_f16_e32 v22, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s23 ; SI-NEXT: v_cvt_f32_f16_e32 v39, s29 ; SI-NEXT: v_cvt_f32_f16_e32 v49, s28 ; SI-NEXT: v_cvt_f32_f16_e32 v50, s27 ; SI-NEXT: v_cvt_f32_f16_e32 v52, s26 ; SI-NEXT: v_cvt_f32_f16_e32 v54, s25 ; SI-NEXT: v_cvt_f32_f16_e32 v40, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v62, s18 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v62, s42 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s40 ; SI-NEXT: s_cbranch_execnz .LBB21_3 ; SI-NEXT: .LBB21_2: ; %cmp.true ; SI-NEXT: s_add_i32 s9, s9, 3 @@ -32337,31 +32505,31 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v3, s35 ; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_lshr_b32 s34, s6, 16 -; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s40, s40, 3 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v3, s34 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: s_add_i32 s44, s44, 3 +; SI-NEXT: s_add_i32 s45, s45, 3 +; SI-NEXT: s_add_i32 s46, s46, 3 +; SI-NEXT: s_add_i32 s47, s47, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: s_add_i32 s27, s27, 3 ; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s47, s47, 3 -; SI-NEXT: s_add_i32 s46, s46, 3 -; SI-NEXT: s_add_i32 s45, s45, 3 -; SI-NEXT: s_add_i32 s44, s44, 3 -; SI-NEXT: s_add_i32 s43, s43, 3 -; SI-NEXT: s_add_i32 s42, s42, 3 -; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s15, s15, 3 ; SI-NEXT: s_add_i32 s14, s14, 3 ; SI-NEXT: s_add_i32 s13, s13, 3 @@ -32370,28 +32538,28 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i ; SI-NEXT: s_add_i32 s10, s10, 3 ; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: s_lshr_b32 s5, s17, 16 -; SI-NEXT: s_lshr_b32 s56, s18, 16 -; SI-NEXT: s_lshr_b32 s57, s19, 16 -; SI-NEXT: s_lshr_b32 s58, s20, 16 -; SI-NEXT: s_lshr_b32 s59, s21, 16 -; SI-NEXT: s_lshr_b32 s60, s22, 16 -; SI-NEXT: s_lshr_b32 s61, s23, 16 +; SI-NEXT: s_lshr_b32 s4, s40, 16 +; SI-NEXT: s_lshr_b32 s5, s41, 16 +; SI-NEXT: s_lshr_b32 s56, s42, 16 +; SI-NEXT: s_lshr_b32 s57, s43, 16 +; SI-NEXT: s_lshr_b32 s58, s44, 16 +; SI-NEXT: s_lshr_b32 s59, s45, 16 +; SI-NEXT: s_lshr_b32 s60, s46, 16 +; SI-NEXT: s_lshr_b32 s61, s47, 16 ; SI-NEXT: s_lshr_b32 s62, s24, 16 ; SI-NEXT: s_lshr_b32 s63, s25, 16 ; SI-NEXT: s_lshr_b32 s72, s26, 16 ; SI-NEXT: s_lshr_b32 s73, s27, 16 ; SI-NEXT: s_lshr_b32 s74, s28, 16 ; SI-NEXT: s_lshr_b32 s75, s29, 16 -; SI-NEXT: s_lshr_b32 s76, s47, 16 -; SI-NEXT: s_lshr_b32 s77, s46, 16 -; SI-NEXT: s_lshr_b32 s78, s45, 16 -; SI-NEXT: s_lshr_b32 s79, s44, 16 -; SI-NEXT: s_lshr_b32 s88, s43, 16 -; SI-NEXT: s_lshr_b32 s89, s42, 16 -; SI-NEXT: s_lshr_b32 s90, s41, 16 -; SI-NEXT: s_lshr_b32 s91, s40, 16 +; SI-NEXT: s_lshr_b32 s76, s23, 16 +; SI-NEXT: s_lshr_b32 s77, s22, 16 +; SI-NEXT: s_lshr_b32 s78, s21, 16 +; SI-NEXT: s_lshr_b32 s79, s20, 16 +; SI-NEXT: s_lshr_b32 s88, s19, 16 +; SI-NEXT: s_lshr_b32 s89, s18, 16 +; SI-NEXT: s_lshr_b32 s90, s17, 16 +; SI-NEXT: s_lshr_b32 s91, s16, 16 ; SI-NEXT: s_lshr_b32 s92, s15, 16 ; SI-NEXT: s_lshr_b32 s93, s14, 16 ; SI-NEXT: s_lshr_b32 s94, s13, 16 @@ -32410,28 +32578,28 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v18, s13 ; SI-NEXT: v_cvt_f32_f16_e32 v20, s14 ; SI-NEXT: v_cvt_f32_f16_e32 v22, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s23 ; SI-NEXT: v_cvt_f32_f16_e32 v39, s29 ; SI-NEXT: v_cvt_f32_f16_e32 v49, s28 ; SI-NEXT: v_cvt_f32_f16_e32 v50, s27 ; SI-NEXT: v_cvt_f32_f16_e32 v52, s26 ; SI-NEXT: v_cvt_f32_f16_e32 v54, s25 ; SI-NEXT: v_cvt_f32_f16_e32 v40, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v62, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v62, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s40 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v3, s31 @@ -35746,37 +35914,65 @@ define inreg <64 x i16> @bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v20, s30, 0 -; SI-NEXT: v_writelane_b32 v20, s31, 1 -; SI-NEXT: v_writelane_b32 v20, s34, 2 -; SI-NEXT: v_writelane_b32 v20, s35, 3 -; SI-NEXT: v_writelane_b32 v20, s36, 4 -; SI-NEXT: v_writelane_b32 v20, s37, 5 -; SI-NEXT: v_writelane_b32 v20, s38, 6 -; SI-NEXT: v_writelane_b32 v20, s39, 7 -; SI-NEXT: v_writelane_b32 v20, s48, 8 -; SI-NEXT: v_writelane_b32 v20, s49, 9 -; SI-NEXT: v_writelane_b32 v20, s50, 10 -; SI-NEXT: v_writelane_b32 v20, s51, 11 -; SI-NEXT: v_writelane_b32 v20, s52, 12 -; SI-NEXT: v_writelane_b32 v20, s53, 13 -; SI-NEXT: v_writelane_b32 v20, s54, 14 -; SI-NEXT: v_writelane_b32 v20, s55, 15 -; SI-NEXT: v_writelane_b32 v20, s64, 16 -; SI-NEXT: v_writelane_b32 v20, s65, 17 -; SI-NEXT: v_writelane_b32 v20, s66, 18 -; SI-NEXT: v_writelane_b32 v20, s67, 19 +; SI-NEXT: v_writelane_b32 v21, s30, 0 +; SI-NEXT: v_writelane_b32 v21, s31, 1 +; SI-NEXT: v_writelane_b32 v21, s34, 2 +; SI-NEXT: v_writelane_b32 v21, s35, 3 +; SI-NEXT: v_writelane_b32 v21, s36, 4 +; SI-NEXT: v_writelane_b32 v21, s37, 5 +; SI-NEXT: v_writelane_b32 v21, s38, 6 +; SI-NEXT: v_mov_b32_e32 v20, s16 +; SI-NEXT: v_writelane_b32 v21, s39, 7 +; SI-NEXT: v_readfirstlane_b32 s56, v20 +; SI-NEXT: v_mov_b32_e32 v20, s17 +; SI-NEXT: v_writelane_b32 v21, s48, 8 +; SI-NEXT: v_readfirstlane_b32 s57, v20 +; SI-NEXT: v_mov_b32_e32 v20, s18 +; SI-NEXT: v_writelane_b32 v21, s49, 9 +; SI-NEXT: v_readfirstlane_b32 s46, v20 +; SI-NEXT: v_mov_b32_e32 v20, s19 +; SI-NEXT: v_writelane_b32 v21, s50, 10 +; SI-NEXT: v_readfirstlane_b32 s47, v20 +; SI-NEXT: v_mov_b32_e32 v20, s20 +; SI-NEXT: v_writelane_b32 v21, s51, 11 +; SI-NEXT: v_readfirstlane_b32 s44, v20 +; SI-NEXT: v_mov_b32_e32 v20, s21 +; SI-NEXT: v_writelane_b32 v21, s52, 12 +; SI-NEXT: v_readfirstlane_b32 s45, v20 +; SI-NEXT: v_mov_b32_e32 v20, s22 +; SI-NEXT: v_writelane_b32 v21, s53, 13 +; SI-NEXT: v_readfirstlane_b32 s42, v20 +; SI-NEXT: v_mov_b32_e32 v20, s23 +; SI-NEXT: v_writelane_b32 v21, s54, 14 +; SI-NEXT: v_readfirstlane_b32 s43, v20 +; SI-NEXT: v_mov_b32_e32 v20, s24 +; SI-NEXT: v_writelane_b32 v21, s55, 15 +; SI-NEXT: v_readfirstlane_b32 s40, v20 +; SI-NEXT: v_mov_b32_e32 v20, s25 +; SI-NEXT: v_writelane_b32 v21, s64, 16 +; SI-NEXT: v_readfirstlane_b32 s41, v20 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_writelane_b32 v21, s65, 17 +; SI-NEXT: v_readfirstlane_b32 s24, v20 +; SI-NEXT: v_mov_b32_e32 v20, s27 +; SI-NEXT: v_writelane_b32 v21, s66, 18 +; SI-NEXT: v_readfirstlane_b32 s25, v20 +; SI-NEXT: v_mov_b32_e32 v20, s28 +; SI-NEXT: v_writelane_b32 v21, s67, 19 +; SI-NEXT: v_readfirstlane_b32 s22, v20 +; SI-NEXT: v_mov_b32_e32 v20, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: v_writelane_b32 v20, s68, 20 -; SI-NEXT: v_readfirstlane_b32 s44, v1 -; SI-NEXT: v_readfirstlane_b32 s45, v2 -; SI-NEXT: v_readfirstlane_b32 s42, v3 -; SI-NEXT: v_readfirstlane_b32 s43, v4 -; SI-NEXT: v_readfirstlane_b32 s40, v5 -; SI-NEXT: v_readfirstlane_b32 s41, v6 +; SI-NEXT: v_writelane_b32 v21, s68, 20 +; SI-NEXT: v_readfirstlane_b32 s23, v20 +; SI-NEXT: v_readfirstlane_b32 s20, v1 +; SI-NEXT: v_readfirstlane_b32 s21, v2 +; SI-NEXT: v_readfirstlane_b32 s18, v3 +; SI-NEXT: v_readfirstlane_b32 s19, v4 +; SI-NEXT: v_readfirstlane_b32 s16, v5 +; SI-NEXT: v_readfirstlane_b32 s17, v6 ; SI-NEXT: v_readfirstlane_b32 s14, v7 ; SI-NEXT: v_readfirstlane_b32 s15, v8 ; SI-NEXT: v_readfirstlane_b32 s12, v9 @@ -35788,9 +35984,9 @@ define inreg <64 x i16> @bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: v_readfirstlane_b32 s6, v15 ; SI-NEXT: v_readfirstlane_b32 s7, v16 ; SI-NEXT: v_readfirstlane_b32 s4, v17 -; SI-NEXT: s_and_b64 s[46:47], vcc, exec +; SI-NEXT: s_and_b64 s[26:27], vcc, exec ; SI-NEXT: v_readfirstlane_b32 s5, v18 -; SI-NEXT: v_writelane_b32 v20, s69, 21 +; SI-NEXT: v_writelane_b32 v21, s69, 21 ; SI-NEXT: s_cbranch_scc0 .LBB25_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s38, s5, 16 @@ -35799,54 +35995,54 @@ define inreg <64 x i16> @bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: s_lshr_b32 s49, s11, 16 ; SI-NEXT: s_lshr_b32 s50, s13, 16 ; SI-NEXT: s_lshr_b32 s51, s15, 16 -; SI-NEXT: s_lshr_b32 s52, s41, 16 -; SI-NEXT: s_lshr_b32 s53, s43, 16 -; SI-NEXT: s_lshr_b32 s54, s45, 16 -; SI-NEXT: s_lshr_b32 s55, s29, 16 -; SI-NEXT: s_lshr_b32 s64, s27, 16 -; SI-NEXT: s_lshr_b32 s65, s25, 16 -; SI-NEXT: s_lshr_b32 s66, s23, 16 -; SI-NEXT: s_lshr_b32 s67, s21, 16 -; SI-NEXT: s_lshr_b32 s68, s19, 16 -; SI-NEXT: s_lshr_b32 s69, s17, 16 -; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[56:57], s[6:7], 16 +; SI-NEXT: s_lshr_b32 s52, s17, 16 +; SI-NEXT: s_lshr_b32 s53, s19, 16 +; SI-NEXT: s_lshr_b32 s54, s21, 16 +; SI-NEXT: s_lshr_b32 s55, s23, 16 +; SI-NEXT: s_lshr_b32 s64, s25, 16 +; SI-NEXT: s_lshr_b32 s65, s41, 16 +; SI-NEXT: s_lshr_b32 s66, s43, 16 +; SI-NEXT: s_lshr_b32 s67, s45, 16 +; SI-NEXT: s_lshr_b32 s68, s47, 16 +; SI-NEXT: s_lshr_b32 s69, s57, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 ; SI-NEXT: s_lshr_b64 s[58:59], s[8:9], 16 ; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 16 ; SI-NEXT: s_lshr_b64 s[62:63], s[12:13], 16 ; SI-NEXT: s_lshr_b64 s[72:73], s[14:15], 16 -; SI-NEXT: s_lshr_b64 s[74:75], s[40:41], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[42:43], 16 -; SI-NEXT: s_lshr_b64 s[78:79], s[44:45], 16 -; SI-NEXT: s_lshr_b64 s[88:89], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[90:91], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[92:93], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[94:95], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[30:31], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[34:35], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[46:47], 16 +; SI-NEXT: s_lshr_b64 s[36:37], s[56:57], 16 ; SI-NEXT: s_cbranch_execnz .LBB25_3 ; SI-NEXT: .LBB25_2: ; %cmp.true -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s57, s57, 3 +; SI-NEXT: s_add_i32 s56, s56, 3 +; SI-NEXT: s_add_i32 s47, s47, 3 +; SI-NEXT: s_add_i32 s46, s46, 3 ; SI-NEXT: s_add_i32 s45, s45, 3 ; SI-NEXT: s_add_i32 s44, s44, 3 ; SI-NEXT: s_add_i32 s43, s43, 3 ; SI-NEXT: s_add_i32 s42, s42, 3 ; SI-NEXT: s_add_i32 s41, s41, 3 ; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s15, s15, 3 ; SI-NEXT: s_add_i32 s14, s14, 3 ; SI-NEXT: s_add_i32 s13, s13, 3 @@ -35859,166 +36055,166 @@ define inreg <64 x i16> @bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s5, s5, 3 ; SI-NEXT: s_add_i32 s4, s4, 3 -; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[56:57], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 ; SI-NEXT: s_lshr_b64 s[58:59], s[8:9], 16 ; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 16 ; SI-NEXT: s_lshr_b64 s[62:63], s[12:13], 16 ; SI-NEXT: s_lshr_b64 s[72:73], s[14:15], 16 -; SI-NEXT: s_lshr_b64 s[74:75], s[40:41], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[42:43], 16 -; SI-NEXT: s_lshr_b64 s[78:79], s[44:45], 16 -; SI-NEXT: s_lshr_b64 s[88:89], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[22:23], 16 ; SI-NEXT: s_lshr_b32 s38, s5, 16 ; SI-NEXT: s_lshr_b32 s39, s7, 16 ; SI-NEXT: s_lshr_b32 s48, s9, 16 ; SI-NEXT: s_lshr_b32 s49, s11, 16 ; SI-NEXT: s_lshr_b32 s50, s13, 16 ; SI-NEXT: s_lshr_b32 s51, s15, 16 -; SI-NEXT: s_lshr_b32 s52, s41, 16 -; SI-NEXT: s_lshr_b32 s53, s43, 16 -; SI-NEXT: s_lshr_b32 s54, s45, 16 -; SI-NEXT: s_lshr_b32 s55, s29, 16 -; SI-NEXT: s_lshr_b32 s64, s27, 16 -; SI-NEXT: s_lshr_b32 s65, s25, 16 -; SI-NEXT: s_lshr_b32 s66, s23, 16 -; SI-NEXT: s_lshr_b32 s67, s21, 16 -; SI-NEXT: s_lshr_b32 s68, s19, 16 -; SI-NEXT: s_lshr_b32 s69, s17, 16 -; SI-NEXT: s_lshr_b64 s[90:91], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[92:93], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[94:95], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[30:31], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[34:35], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s52, s17, 16 +; SI-NEXT: s_lshr_b32 s53, s19, 16 +; SI-NEXT: s_lshr_b32 s54, s21, 16 +; SI-NEXT: s_lshr_b32 s55, s23, 16 +; SI-NEXT: s_lshr_b32 s64, s25, 16 +; SI-NEXT: s_lshr_b32 s65, s41, 16 +; SI-NEXT: s_lshr_b32 s66, s43, 16 +; SI-NEXT: s_lshr_b32 s67, s45, 16 +; SI-NEXT: s_lshr_b32 s68, s47, 16 +; SI-NEXT: s_lshr_b32 s69, s57, 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[46:47], 16 +; SI-NEXT: s_lshr_b64 s[36:37], s[56:57], 16 ; SI-NEXT: .LBB25_3: ; %end -; SI-NEXT: s_lshl_b32 s47, s36, 16 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s16, s16, s47 -; SI-NEXT: v_mov_b32_e32 v1, s16 -; SI-NEXT: s_and_b32 s16, s17, 0xffff -; SI-NEXT: s_lshl_b32 s17, s69, 16 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_lshl_b32 s16, s34, 16 -; SI-NEXT: s_and_b32 s17, s18, 0xffff -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_mov_b32_e32 v3, s16 -; SI-NEXT: s_and_b32 s16, s19, 0xffff -; SI-NEXT: s_lshl_b32 s17, s68, 16 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_mov_b32_e32 v4, s16 -; SI-NEXT: s_lshl_b32 s16, s30, 16 -; SI-NEXT: s_and_b32 s17, s20, 0xffff -; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_lshl_b32 s27, s36, 16 +; SI-NEXT: s_and_b32 s29, s56, 0xffff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: v_mov_b32_e32 v1, s27 +; SI-NEXT: s_and_b32 s27, s57, 0xffff +; SI-NEXT: s_lshl_b32 s29, s69, 16 +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_lshl_b32 s27, s34, 16 +; SI-NEXT: s_and_b32 s29, s46, 0xffff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: v_mov_b32_e32 v3, s27 +; SI-NEXT: s_and_b32 s27, s47, 0xffff +; SI-NEXT: s_lshl_b32 s29, s68, 16 +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_mov_b32_e32 v4, s27 +; SI-NEXT: s_lshl_b32 s27, s30, 16 +; SI-NEXT: s_and_b32 s29, s44, 0xffff +; SI-NEXT: s_or_b32 s27, s29, s27 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_mov_b32_e32 v5, s16 +; SI-NEXT: v_mov_b32_e32 v5, s27 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: s_and_b32 s16, s21, 0xffff -; SI-NEXT: s_lshl_b32 s17, s67, 16 +; SI-NEXT: s_and_b32 s27, s45, 0xffff +; SI-NEXT: s_lshl_b32 s29, s67, 16 ; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s22, 0xffff -; SI-NEXT: s_lshl_b32 s17, s94, 16 +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_and_b32 s27, s42, 0xffff +; SI-NEXT: s_lshl_b32 s29, s94, 16 ; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s23, 0xffff -; SI-NEXT: s_lshl_b32 s17, s66, 16 +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_and_b32 s27, s43, 0xffff +; SI-NEXT: s_lshl_b32 s29, s66, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s24, 0xffff -; SI-NEXT: s_lshl_b32 s17, s92, 16 +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_and_b32 s27, s40, 0xffff +; SI-NEXT: s_lshl_b32 s29, s92, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s25, 0xffff -; SI-NEXT: s_lshl_b32 s17, s65, 16 +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_and_b32 s27, s41, 0xffff +; SI-NEXT: s_lshl_b32 s29, s65, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s26, 0xffff -; SI-NEXT: s_lshl_b32 s17, s90, 16 +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_lshl_b32 s27, s90, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s24, s24, s27 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s27, 0xffff -; SI-NEXT: s_lshl_b32 s17, s64, 16 +; SI-NEXT: v_mov_b32_e32 v2, s24 +; SI-NEXT: s_and_b32 s24, s25, 0xffff +; SI-NEXT: s_lshl_b32 s25, s64, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s24, s24, s25 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s28, 0xffff -; SI-NEXT: s_lshl_b32 s17, s88, 16 +; SI-NEXT: v_mov_b32_e32 v2, s24 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_lshl_b32 s24, s88, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s22, s22, s24 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s29, 0xffff -; SI-NEXT: s_lshl_b32 s17, s55, 16 +; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: s_and_b32 s22, s23, 0xffff +; SI-NEXT: s_lshl_b32 s23, s55, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s22, s22, s23 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s44, 0xffff -; SI-NEXT: s_lshl_b32 s17, s78, 16 +; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_lshl_b32 s22, s78, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s20, s20, s22 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s45, 0xffff -; SI-NEXT: s_lshl_b32 s17, s54, 16 +; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_and_b32 s20, s21, 0xffff +; SI-NEXT: s_lshl_b32 s21, s54, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s20, s20, s21 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s42, 0xffff -; SI-NEXT: s_lshl_b32 s17, s76, 16 +; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s20, s76, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s18, s18, s20 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s43, 0xffff -; SI-NEXT: s_lshl_b32 s17, s53, 16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_and_b32 s18, s19, 0xffff +; SI-NEXT: s_lshl_b32 s19, s53, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s40, 0xffff -; SI-NEXT: s_lshl_b32 s17, s74, 16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s18, s74, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s16, s16, s18 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s41, 0xffff +; SI-NEXT: s_and_b32 s16, s17, 0xffff ; SI-NEXT: s_lshl_b32 s17, s52, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v0 ; SI-NEXT: s_or_b32 s16, s16, s17 @@ -36082,7 +36278,7 @@ define inreg <64 x i16> @bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_lshl_b32 s8, s56, 16 +; SI-NEXT: s_lshl_b32 s8, s28, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x6c, v0 ; SI-NEXT: s_or_b32 s6, s6, s8 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -36096,7 +36292,7 @@ define inreg <64 x i16> @bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s6, s46, 16 +; SI-NEXT: s_lshl_b32 s6, s26, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x74, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -36110,30 +36306,30 @@ define inreg <64 x i16> @bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: v_readlane_b32 s69, v20, 21 -; SI-NEXT: v_readlane_b32 s68, v20, 20 -; SI-NEXT: v_readlane_b32 s67, v20, 19 -; SI-NEXT: v_readlane_b32 s66, v20, 18 -; SI-NEXT: v_readlane_b32 s65, v20, 17 -; SI-NEXT: v_readlane_b32 s64, v20, 16 -; SI-NEXT: v_readlane_b32 s55, v20, 15 -; SI-NEXT: v_readlane_b32 s54, v20, 14 -; SI-NEXT: v_readlane_b32 s53, v20, 13 -; SI-NEXT: v_readlane_b32 s52, v20, 12 -; SI-NEXT: v_readlane_b32 s51, v20, 11 -; SI-NEXT: v_readlane_b32 s50, v20, 10 -; SI-NEXT: v_readlane_b32 s49, v20, 9 -; SI-NEXT: v_readlane_b32 s48, v20, 8 -; SI-NEXT: v_readlane_b32 s39, v20, 7 -; SI-NEXT: v_readlane_b32 s38, v20, 6 -; SI-NEXT: v_readlane_b32 s37, v20, 5 -; SI-NEXT: v_readlane_b32 s36, v20, 4 -; SI-NEXT: v_readlane_b32 s35, v20, 3 -; SI-NEXT: v_readlane_b32 s34, v20, 2 -; SI-NEXT: v_readlane_b32 s31, v20, 1 -; SI-NEXT: v_readlane_b32 s30, v20, 0 +; SI-NEXT: v_readlane_b32 s69, v21, 21 +; SI-NEXT: v_readlane_b32 s68, v21, 20 +; SI-NEXT: v_readlane_b32 s67, v21, 19 +; SI-NEXT: v_readlane_b32 s66, v21, 18 +; SI-NEXT: v_readlane_b32 s65, v21, 17 +; SI-NEXT: v_readlane_b32 s64, v21, 16 +; SI-NEXT: v_readlane_b32 s55, v21, 15 +; SI-NEXT: v_readlane_b32 s54, v21, 14 +; SI-NEXT: v_readlane_b32 s53, v21, 13 +; SI-NEXT: v_readlane_b32 s52, v21, 12 +; SI-NEXT: v_readlane_b32 s51, v21, 11 +; SI-NEXT: v_readlane_b32 s50, v21, 10 +; SI-NEXT: v_readlane_b32 s49, v21, 9 +; SI-NEXT: v_readlane_b32 s48, v21, 8 +; SI-NEXT: v_readlane_b32 s39, v21, 7 +; SI-NEXT: v_readlane_b32 s38, v21, 6 +; SI-NEXT: v_readlane_b32 s37, v21, 5 +; SI-NEXT: v_readlane_b32 s36, v21, 4 +; SI-NEXT: v_readlane_b32 s35, v21, 3 +; SI-NEXT: v_readlane_b32 s34, v21, 2 +; SI-NEXT: v_readlane_b32 s31, v21, 1 +; SI-NEXT: v_readlane_b32 s30, v21, 0 ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -36168,8 +36364,8 @@ define inreg <64 x i16> @bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr39 ; SI-NEXT: ; implicit-def: $sgpr38 ; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: s_branch .LBB25_2 ; ; VI-LABEL: bitcast_v32i32_to_v64i16_scalar: @@ -37661,23 +37857,51 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; VI-LABEL: bitcast_v64i16_to_v32i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v19, s16 ; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: v_mov_b32_e32 v2, s17 ; VI-NEXT: v_readfirstlane_b32 s7, v3 +; VI-NEXT: v_mov_b32_e32 v3, s18 ; VI-NEXT: v_readfirstlane_b32 s8, v4 +; VI-NEXT: v_mov_b32_e32 v4, s19 ; VI-NEXT: v_readfirstlane_b32 s9, v5 +; VI-NEXT: v_mov_b32_e32 v5, s20 ; VI-NEXT: v_readfirstlane_b32 s10, v6 +; VI-NEXT: v_mov_b32_e32 v6, s21 ; VI-NEXT: v_readfirstlane_b32 s11, v7 +; VI-NEXT: v_mov_b32_e32 v7, s22 ; VI-NEXT: v_readfirstlane_b32 s12, v8 +; VI-NEXT: v_mov_b32_e32 v8, s23 ; VI-NEXT: v_readfirstlane_b32 s13, v9 +; VI-NEXT: v_mov_b32_e32 v9, s24 ; VI-NEXT: v_readfirstlane_b32 s14, v10 +; VI-NEXT: v_mov_b32_e32 v10, s25 ; VI-NEXT: v_readfirstlane_b32 s15, v11 -; VI-NEXT: v_readfirstlane_b32 s40, v12 -; VI-NEXT: v_readfirstlane_b32 s41, v13 -; VI-NEXT: v_readfirstlane_b32 s42, v14 -; VI-NEXT: v_readfirstlane_b32 s43, v15 -; VI-NEXT: v_readfirstlane_b32 s44, v16 -; VI-NEXT: v_readfirstlane_b32 s45, v17 +; VI-NEXT: v_mov_b32_e32 v11, s26 +; VI-NEXT: v_readfirstlane_b32 s16, v12 +; VI-NEXT: v_mov_b32_e32 v12, s27 +; VI-NEXT: v_readfirstlane_b32 s17, v13 +; VI-NEXT: v_mov_b32_e32 v13, s28 +; VI-NEXT: v_readfirstlane_b32 s18, v14 +; VI-NEXT: v_mov_b32_e32 v14, s29 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_readfirstlane_b32 s19, v15 +; VI-NEXT: v_readfirstlane_b32 s20, v16 +; VI-NEXT: v_readfirstlane_b32 s21, v17 +; VI-NEXT: v_readfirstlane_b32 s22, v19 +; VI-NEXT: v_readfirstlane_b32 s23, v2 +; VI-NEXT: v_readfirstlane_b32 s24, v3 +; VI-NEXT: v_readfirstlane_b32 s25, v4 +; VI-NEXT: v_readfirstlane_b32 s26, v5 +; VI-NEXT: v_readfirstlane_b32 s27, v6 +; VI-NEXT: v_readfirstlane_b32 s28, v7 +; VI-NEXT: v_readfirstlane_b32 s29, v8 +; VI-NEXT: v_readfirstlane_b32 s40, v9 +; VI-NEXT: v_readfirstlane_b32 s41, v10 +; VI-NEXT: v_readfirstlane_b32 s42, v11 +; VI-NEXT: v_readfirstlane_b32 s43, v12 +; VI-NEXT: v_readfirstlane_b32 s44, v13 +; VI-NEXT: v_readfirstlane_b32 s45, v14 ; VI-NEXT: v_readfirstlane_b32 s46, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s47, v1 @@ -37694,8 +37918,38 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; VI-NEXT: s_and_b32 s4, s46, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s29, 3 +; VI-NEXT: s_add_i32 s5, s45, 3 ; VI-NEXT: s_add_i32 s46, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s45, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s44, 3 +; VI-NEXT: s_add_i32 s45, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s44, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s43, 3 +; VI-NEXT: s_add_i32 s44, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s43, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s42, 3 +; VI-NEXT: s_add_i32 s43, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s42, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s41, 3 +; VI-NEXT: s_add_i32 s42, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s41, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s40, 3 +; VI-NEXT: s_add_i32 s41, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s40, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s29, 3 +; VI-NEXT: s_add_i32 s40, s4, 0x30000 ; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -37764,38 +38018,8 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s45, 3 -; VI-NEXT: s_add_i32 s16, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s45, 0xffff0000 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s44, 3 -; VI-NEXT: s_add_i32 s45, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s44, 0xffff0000 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s43, 3 -; VI-NEXT: s_add_i32 s44, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s43, 0xffff0000 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s42, 3 -; VI-NEXT: s_add_i32 s43, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s42, 0xffff0000 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s41, 3 -; VI-NEXT: s_add_i32 s42, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s41, 0xffff0000 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s40, 3 -; VI-NEXT: s_add_i32 s41, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s40, 0xffff0000 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s15, 3 -; VI-NEXT: s_add_i32 s40, s4, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: s_and_b32 s4, s15, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -37846,20 +38070,20 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s6, s4, 0x30000 ; VI-NEXT: .LBB27_3: ; %end -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 -; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 -; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v0, s22 +; VI-NEXT: v_mov_b32_e32 v1, s23 +; VI-NEXT: v_mov_b32_e32 v2, s24 +; VI-NEXT: v_mov_b32_e32 v3, s25 +; VI-NEXT: v_mov_b32_e32 v4, s26 +; VI-NEXT: v_mov_b32_e32 v5, s27 +; VI-NEXT: v_mov_b32_e32 v6, s28 +; VI-NEXT: v_mov_b32_e32 v7, s29 +; VI-NEXT: v_mov_b32_e32 v8, s40 +; VI-NEXT: v_mov_b32_e32 v9, s41 +; VI-NEXT: v_mov_b32_e32 v10, s42 +; VI-NEXT: v_mov_b32_e32 v11, s43 +; VI-NEXT: v_mov_b32_e32 v12, s44 +; VI-NEXT: v_mov_b32_e32 v13, s45 ; VI-NEXT: v_mov_b32_e32 v14, s46 ; VI-NEXT: v_mov_b32_e32 v15, s47 ; VI-NEXT: v_mov_b32_e32 v16, s6 @@ -37872,12 +38096,12 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v23, s13 ; VI-NEXT: v_mov_b32_e32 v24, s14 ; VI-NEXT: v_mov_b32_e32 v25, s15 -; VI-NEXT: v_mov_b32_e32 v26, s40 -; VI-NEXT: v_mov_b32_e32 v27, s41 -; VI-NEXT: v_mov_b32_e32 v28, s42 -; VI-NEXT: v_mov_b32_e32 v29, s43 -; VI-NEXT: v_mov_b32_e32 v30, s44 -; VI-NEXT: v_mov_b32_e32 v31, s45 +; VI-NEXT: v_mov_b32_e32 v26, s16 +; VI-NEXT: v_mov_b32_e32 v27, s17 +; VI-NEXT: v_mov_b32_e32 v28, s18 +; VI-NEXT: v_mov_b32_e32 v29, s19 +; VI-NEXT: v_mov_b32_e32 v30, s20 +; VI-NEXT: v_mov_b32_e32 v31, s21 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB27_4: ; VI-NEXT: s_branch .LBB27_2 @@ -44536,9 +44760,9 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_writelane_b32 v63, s30, 0 @@ -44563,40 +44787,68 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_writelane_b32 v63, s67, 19 ; SI-NEXT: v_writelane_b32 v63, s68, 20 ; SI-NEXT: v_writelane_b32 v63, s69, 21 +; SI-NEXT: v_mov_b32_e32 v20, s16 ; SI-NEXT: v_writelane_b32 v63, s70, 22 +; SI-NEXT: v_readfirstlane_b32 s58, v20 +; SI-NEXT: v_mov_b32_e32 v20, s17 ; SI-NEXT: v_writelane_b32 v63, s71, 23 +; SI-NEXT: v_readfirstlane_b32 s59, v20 +; SI-NEXT: v_mov_b32_e32 v20, s18 ; SI-NEXT: v_writelane_b32 v63, s80, 24 +; SI-NEXT: v_readfirstlane_b32 s56, v20 +; SI-NEXT: v_mov_b32_e32 v20, s19 ; SI-NEXT: v_writelane_b32 v63, s81, 25 +; SI-NEXT: v_readfirstlane_b32 s57, v20 +; SI-NEXT: v_mov_b32_e32 v20, s20 ; SI-NEXT: v_writelane_b32 v63, s82, 26 +; SI-NEXT: v_readfirstlane_b32 s46, v20 +; SI-NEXT: v_mov_b32_e32 v20, s21 ; SI-NEXT: v_writelane_b32 v63, s83, 27 +; SI-NEXT: v_readfirstlane_b32 s47, v20 +; SI-NEXT: v_mov_b32_e32 v20, s22 ; SI-NEXT: v_writelane_b32 v63, s84, 28 +; SI-NEXT: v_readfirstlane_b32 s44, v20 +; SI-NEXT: v_mov_b32_e32 v20, s23 ; SI-NEXT: v_writelane_b32 v63, s85, 29 +; SI-NEXT: v_readfirstlane_b32 s45, v20 +; SI-NEXT: v_mov_b32_e32 v20, s24 ; SI-NEXT: v_writelane_b32 v63, s86, 30 +; SI-NEXT: v_readfirstlane_b32 s42, v20 +; SI-NEXT: v_mov_b32_e32 v20, s25 ; SI-NEXT: v_writelane_b32 v63, s87, 31 +; SI-NEXT: v_readfirstlane_b32 s43, v20 +; SI-NEXT: v_mov_b32_e32 v20, s26 ; SI-NEXT: v_writelane_b32 v63, s96, 32 +; SI-NEXT: v_readfirstlane_b32 s40, v20 +; SI-NEXT: v_mov_b32_e32 v20, s27 ; SI-NEXT: v_writelane_b32 v63, s97, 33 +; SI-NEXT: v_readfirstlane_b32 s41, v20 +; SI-NEXT: v_mov_b32_e32 v20, s28 ; SI-NEXT: v_writelane_b32 v63, s98, 34 +; SI-NEXT: v_readfirstlane_b32 s24, v20 +; SI-NEXT: v_mov_b32_e32 v20, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 ; SI-NEXT: v_writelane_b32 v63, s99, 35 -; SI-NEXT: v_readfirstlane_b32 s44, v1 -; SI-NEXT: v_readfirstlane_b32 s45, v2 -; SI-NEXT: v_readfirstlane_b32 s42, v3 -; SI-NEXT: v_readfirstlane_b32 s43, v4 -; SI-NEXT: v_readfirstlane_b32 s40, v5 -; SI-NEXT: v_readfirstlane_b32 s41, v6 -; SI-NEXT: v_readfirstlane_b32 s14, v7 -; SI-NEXT: v_readfirstlane_b32 s15, v8 -; SI-NEXT: v_readfirstlane_b32 s12, v9 -; SI-NEXT: v_readfirstlane_b32 s13, v10 -; SI-NEXT: v_readfirstlane_b32 s10, v11 -; SI-NEXT: v_readfirstlane_b32 s11, v12 -; SI-NEXT: v_readfirstlane_b32 s8, v13 -; SI-NEXT: v_readfirstlane_b32 s9, v14 -; SI-NEXT: v_readfirstlane_b32 s6, v15 -; SI-NEXT: v_readfirstlane_b32 s7, v16 -; SI-NEXT: v_readfirstlane_b32 s4, v17 -; SI-NEXT: s_and_b64 s[46:47], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s5, v18 +; SI-NEXT: v_readfirstlane_b32 s25, v20 +; SI-NEXT: v_readfirstlane_b32 s22, v1 +; SI-NEXT: v_readfirstlane_b32 s23, v2 +; SI-NEXT: v_readfirstlane_b32 s20, v3 +; SI-NEXT: v_readfirstlane_b32 s21, v4 +; SI-NEXT: v_readfirstlane_b32 s18, v5 +; SI-NEXT: v_readfirstlane_b32 s19, v6 +; SI-NEXT: v_readfirstlane_b32 s16, v7 +; SI-NEXT: v_readfirstlane_b32 s17, v8 +; SI-NEXT: v_readfirstlane_b32 s14, v9 +; SI-NEXT: v_readfirstlane_b32 s15, v10 +; SI-NEXT: v_readfirstlane_b32 s12, v11 +; SI-NEXT: v_readfirstlane_b32 s13, v12 +; SI-NEXT: v_readfirstlane_b32 s10, v13 +; SI-NEXT: v_readfirstlane_b32 s11, v14 +; SI-NEXT: v_readfirstlane_b32 s8, v15 +; SI-NEXT: v_readfirstlane_b32 s9, v16 +; SI-NEXT: v_readfirstlane_b32 s6, v17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s7, v18 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill @@ -44614,497 +44866,551 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane ; SI-NEXT: s_cbranch_scc0 .LBB37_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s46, s5, 24 -; SI-NEXT: v_writelane_b32 v62, s46, 17 -; SI-NEXT: s_lshr_b32 s46, s5, 16 -; SI-NEXT: v_writelane_b32 v62, s46, 16 -; SI-NEXT: s_lshr_b32 s46, s5, 8 -; SI-NEXT: v_writelane_b32 v62, s46, 15 -; SI-NEXT: s_lshr_b32 s46, s7, 24 -; SI-NEXT: v_writelane_b32 v62, s46, 14 -; SI-NEXT: s_lshr_b32 s46, s7, 16 -; SI-NEXT: v_writelane_b32 v62, s46, 13 -; SI-NEXT: s_lshr_b32 s46, s7, 8 -; SI-NEXT: v_writelane_b32 v62, s46, 12 -; SI-NEXT: s_lshr_b32 s46, s9, 24 -; SI-NEXT: v_writelane_b32 v62, s46, 11 -; SI-NEXT: s_lshr_b32 s46, s9, 16 -; SI-NEXT: v_writelane_b32 v62, s46, 10 -; SI-NEXT: s_lshr_b32 s46, s9, 8 -; SI-NEXT: v_writelane_b32 v62, s46, 9 -; SI-NEXT: s_lshr_b32 s46, s11, 24 -; SI-NEXT: v_writelane_b32 v62, s46, 8 -; SI-NEXT: s_lshr_b32 s46, s11, 16 -; SI-NEXT: v_writelane_b32 v62, s46, 7 -; SI-NEXT: s_lshr_b32 s46, s11, 8 -; SI-NEXT: v_writelane_b32 v62, s46, 6 -; SI-NEXT: s_lshr_b32 s46, s13, 24 -; SI-NEXT: v_writelane_b32 v62, s46, 5 -; SI-NEXT: s_lshr_b32 s46, s13, 16 -; SI-NEXT: v_writelane_b32 v62, s46, 4 -; SI-NEXT: s_lshr_b32 s46, s13, 8 -; SI-NEXT: v_writelane_b32 v62, s46, 3 -; SI-NEXT: s_lshr_b32 s46, s15, 24 -; SI-NEXT: v_writelane_b32 v62, s46, 2 -; SI-NEXT: s_lshr_b32 s46, s15, 16 -; SI-NEXT: v_writelane_b32 v62, s46, 1 -; SI-NEXT: s_lshr_b32 s46, s15, 8 -; SI-NEXT: v_writelane_b32 v62, s46, 0 -; SI-NEXT: s_lshr_b32 s46, s41, 24 -; SI-NEXT: v_writelane_b32 v61, s46, 63 -; SI-NEXT: s_lshr_b32 s46, s41, 16 -; SI-NEXT: v_writelane_b32 v61, s46, 62 -; SI-NEXT: s_lshr_b32 s46, s41, 8 -; SI-NEXT: v_writelane_b32 v61, s46, 61 -; SI-NEXT: s_lshr_b32 s46, s43, 24 -; SI-NEXT: v_writelane_b32 v61, s46, 60 -; SI-NEXT: s_lshr_b32 s46, s43, 16 -; SI-NEXT: v_writelane_b32 v61, s46, 59 -; SI-NEXT: s_lshr_b32 s46, s43, 8 -; SI-NEXT: v_writelane_b32 v61, s46, 58 -; SI-NEXT: s_lshr_b32 s46, s45, 24 -; SI-NEXT: v_writelane_b32 v61, s46, 57 -; SI-NEXT: s_lshr_b32 s46, s45, 16 -; SI-NEXT: v_writelane_b32 v61, s46, 56 -; SI-NEXT: s_lshr_b32 s46, s45, 8 -; SI-NEXT: v_writelane_b32 v61, s46, 55 -; SI-NEXT: s_lshr_b32 s46, s29, 24 -; SI-NEXT: v_writelane_b32 v61, s46, 54 -; SI-NEXT: s_lshr_b32 s46, s29, 16 -; SI-NEXT: v_writelane_b32 v61, s46, 53 -; SI-NEXT: s_lshr_b32 s46, s29, 8 -; SI-NEXT: v_writelane_b32 v61, s46, 52 -; SI-NEXT: s_lshr_b32 s46, s27, 24 -; SI-NEXT: v_writelane_b32 v61, s46, 51 -; SI-NEXT: s_lshr_b32 s46, s27, 16 -; SI-NEXT: v_writelane_b32 v61, s46, 50 -; SI-NEXT: s_lshr_b32 s46, s27, 8 -; SI-NEXT: v_writelane_b32 v61, s46, 49 -; SI-NEXT: s_lshr_b32 s46, s25, 24 -; SI-NEXT: v_writelane_b32 v61, s46, 48 -; SI-NEXT: s_lshr_b32 s46, s25, 16 -; SI-NEXT: v_writelane_b32 v61, s46, 47 -; SI-NEXT: s_lshr_b32 s46, s25, 8 -; SI-NEXT: v_writelane_b32 v61, s46, 46 -; SI-NEXT: s_lshr_b32 s46, s23, 24 -; SI-NEXT: v_writelane_b32 v61, s46, 45 -; SI-NEXT: s_lshr_b32 s46, s23, 16 -; SI-NEXT: v_writelane_b32 v61, s46, 44 -; SI-NEXT: s_lshr_b32 s46, s23, 8 -; SI-NEXT: v_writelane_b32 v61, s46, 43 -; SI-NEXT: s_lshr_b32 s46, s21, 24 -; SI-NEXT: v_writelane_b32 v61, s46, 42 -; SI-NEXT: s_lshr_b32 s46, s21, 16 -; SI-NEXT: v_writelane_b32 v61, s46, 41 -; SI-NEXT: s_lshr_b32 s46, s21, 8 -; SI-NEXT: v_writelane_b32 v61, s46, 40 -; SI-NEXT: s_lshr_b32 s46, s19, 24 -; SI-NEXT: v_writelane_b32 v61, s46, 39 -; SI-NEXT: s_lshr_b32 s46, s19, 16 -; SI-NEXT: v_writelane_b32 v61, s46, 38 -; SI-NEXT: s_lshr_b32 s46, s19, 8 -; SI-NEXT: v_writelane_b32 v61, s46, 37 -; SI-NEXT: s_lshr_b32 s46, s17, 24 -; SI-NEXT: v_writelane_b32 v61, s46, 36 -; SI-NEXT: s_lshr_b32 s46, s17, 16 -; SI-NEXT: v_writelane_b32 v61, s46, 35 -; SI-NEXT: s_lshr_b32 s46, s17, 8 -; SI-NEXT: v_writelane_b32 v61, s46, 34 -; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 -; SI-NEXT: v_writelane_b32 v61, s46, 32 -; SI-NEXT: v_writelane_b32 v61, s47, 33 -; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 8 -; SI-NEXT: v_writelane_b32 v61, s46, 30 -; SI-NEXT: v_writelane_b32 v61, s47, 31 -; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 24 -; SI-NEXT: v_writelane_b32 v61, s46, 28 -; SI-NEXT: v_writelane_b32 v61, s47, 29 -; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 16 -; SI-NEXT: v_writelane_b32 v61, s46, 26 -; SI-NEXT: v_writelane_b32 v61, s47, 27 -; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 8 -; SI-NEXT: v_writelane_b32 v61, s46, 24 -; SI-NEXT: v_writelane_b32 v61, s47, 25 -; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 24 -; SI-NEXT: v_writelane_b32 v61, s46, 22 -; SI-NEXT: v_writelane_b32 v61, s47, 23 -; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 16 -; SI-NEXT: v_writelane_b32 v61, s46, 20 -; SI-NEXT: v_writelane_b32 v61, s47, 21 -; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 8 -; SI-NEXT: v_writelane_b32 v61, s46, 18 -; SI-NEXT: v_writelane_b32 v61, s47, 19 -; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 24 -; SI-NEXT: v_writelane_b32 v61, s46, 16 -; SI-NEXT: v_writelane_b32 v61, s47, 17 -; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 16 -; SI-NEXT: v_writelane_b32 v61, s46, 14 -; SI-NEXT: v_writelane_b32 v61, s47, 15 -; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 8 -; SI-NEXT: v_writelane_b32 v61, s46, 12 -; SI-NEXT: v_writelane_b32 v61, s47, 13 -; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 24 -; SI-NEXT: v_writelane_b32 v61, s46, 10 -; SI-NEXT: v_writelane_b32 v61, s47, 11 -; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 16 -; SI-NEXT: v_writelane_b32 v61, s46, 8 -; SI-NEXT: v_writelane_b32 v61, s47, 9 -; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 8 -; SI-NEXT: v_writelane_b32 v61, s46, 6 -; SI-NEXT: v_writelane_b32 v61, s47, 7 -; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 24 -; SI-NEXT: v_writelane_b32 v61, s46, 4 -; SI-NEXT: v_writelane_b32 v61, s47, 5 -; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 16 -; SI-NEXT: v_writelane_b32 v61, s46, 2 -; SI-NEXT: v_writelane_b32 v61, s47, 3 -; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 8 -; SI-NEXT: v_writelane_b32 v61, s46, 0 -; SI-NEXT: s_lshr_b64 s[48:49], s[4:5], 16 -; SI-NEXT: v_writelane_b32 v61, s47, 1 -; SI-NEXT: s_lshr_b64 s[50:51], s[40:41], 24 -; SI-NEXT: s_lshr_b64 s[52:53], s[40:41], 16 -; SI-NEXT: s_lshr_b64 s[54:55], s[40:41], 8 +; SI-NEXT: s_lshr_b32 s4, s7, 24 +; SI-NEXT: v_writelane_b32 v62, s4, 17 +; SI-NEXT: s_lshr_b32 s4, s9, 24 +; SI-NEXT: v_writelane_b32 v62, s4, 16 +; SI-NEXT: s_lshr_b32 s4, s11, 24 +; SI-NEXT: v_writelane_b32 v62, s4, 15 +; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: v_writelane_b32 v62, s4, 14 +; SI-NEXT: s_lshr_b32 s4, s13, 24 +; SI-NEXT: v_writelane_b32 v62, s4, 13 +; SI-NEXT: s_lshr_b32 s4, s13, 16 +; SI-NEXT: v_writelane_b32 v62, s4, 12 +; SI-NEXT: s_lshr_b32 s4, s13, 8 +; SI-NEXT: v_writelane_b32 v62, s4, 11 +; SI-NEXT: s_lshr_b32 s4, s15, 24 +; SI-NEXT: v_writelane_b32 v62, s4, 10 +; SI-NEXT: s_lshr_b32 s4, s15, 16 +; SI-NEXT: v_writelane_b32 v62, s4, 9 +; SI-NEXT: s_lshr_b32 s4, s15, 8 +; SI-NEXT: v_writelane_b32 v62, s4, 8 +; SI-NEXT: s_lshr_b32 s4, s17, 24 +; SI-NEXT: v_writelane_b32 v62, s4, 7 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_writelane_b32 v62, s4, 6 +; SI-NEXT: s_lshr_b32 s4, s17, 8 +; SI-NEXT: v_writelane_b32 v62, s4, 5 +; SI-NEXT: s_lshr_b32 s4, s19, 24 +; SI-NEXT: v_writelane_b32 v62, s4, 4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_writelane_b32 v62, s4, 3 +; SI-NEXT: s_lshr_b32 s4, s19, 8 +; SI-NEXT: v_writelane_b32 v62, s4, 2 +; SI-NEXT: s_lshr_b32 s4, s21, 24 +; SI-NEXT: v_writelane_b32 v62, s4, 1 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_writelane_b32 v62, s4, 0 +; SI-NEXT: s_lshr_b32 s4, s21, 8 +; SI-NEXT: v_writelane_b32 v61, s4, 63 +; SI-NEXT: s_lshr_b32 s4, s23, 24 +; SI-NEXT: v_writelane_b32 v61, s4, 62 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_writelane_b32 v61, s4, 61 +; SI-NEXT: s_lshr_b32 s4, s23, 8 +; SI-NEXT: v_writelane_b32 v61, s4, 60 +; SI-NEXT: s_lshr_b32 s4, s25, 24 +; SI-NEXT: v_writelane_b32 v61, s4, 59 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_writelane_b32 v61, s4, 58 +; SI-NEXT: s_lshr_b32 s4, s25, 8 +; SI-NEXT: v_writelane_b32 v61, s4, 57 +; SI-NEXT: s_lshr_b32 s4, s41, 24 +; SI-NEXT: v_writelane_b32 v61, s4, 56 +; SI-NEXT: s_lshr_b32 s4, s41, 16 +; SI-NEXT: v_writelane_b32 v61, s4, 55 +; SI-NEXT: s_lshr_b32 s4, s41, 8 +; SI-NEXT: v_writelane_b32 v61, s4, 54 +; SI-NEXT: s_lshr_b32 s4, s43, 24 +; SI-NEXT: v_writelane_b32 v61, s4, 53 +; SI-NEXT: s_lshr_b32 s4, s43, 16 +; SI-NEXT: v_writelane_b32 v61, s4, 52 +; SI-NEXT: s_lshr_b32 s4, s43, 8 +; SI-NEXT: v_writelane_b32 v61, s4, 51 +; SI-NEXT: s_lshr_b32 s4, s45, 24 +; SI-NEXT: v_writelane_b32 v61, s4, 50 +; SI-NEXT: s_lshr_b32 s4, s45, 16 +; SI-NEXT: v_writelane_b32 v61, s4, 49 +; SI-NEXT: s_lshr_b32 s4, s45, 8 +; SI-NEXT: v_writelane_b32 v61, s4, 48 +; SI-NEXT: s_lshr_b32 s4, s47, 24 +; SI-NEXT: v_writelane_b32 v61, s4, 47 +; SI-NEXT: s_lshr_b32 s4, s47, 16 +; SI-NEXT: v_writelane_b32 v61, s4, 46 +; SI-NEXT: s_lshr_b32 s4, s47, 8 +; SI-NEXT: v_writelane_b32 v61, s4, 45 +; SI-NEXT: s_lshr_b32 s4, s57, 16 +; SI-NEXT: v_writelane_b32 v61, s4, 44 +; SI-NEXT: s_lshr_b32 s4, s57, 8 +; SI-NEXT: v_writelane_b32 v61, s4, 43 +; SI-NEXT: s_lshr_b32 s4, s59, 8 +; SI-NEXT: v_writelane_b32 v61, s4, 42 +; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], 24 +; SI-NEXT: v_writelane_b32 v61, s4, 40 +; SI-NEXT: v_writelane_b32 v61, s5, 41 +; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], 16 +; SI-NEXT: v_writelane_b32 v61, s4, 38 +; SI-NEXT: v_writelane_b32 v61, s5, 39 +; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], 8 +; SI-NEXT: v_writelane_b32 v61, s4, 36 +; SI-NEXT: v_writelane_b32 v61, s5, 37 +; SI-NEXT: s_lshr_b64 s[4:5], s[8:9], 24 +; SI-NEXT: v_writelane_b32 v61, s4, 34 +; SI-NEXT: v_writelane_b32 v61, s5, 35 +; SI-NEXT: s_lshr_b64 s[4:5], s[8:9], 16 +; SI-NEXT: v_writelane_b32 v61, s4, 32 +; SI-NEXT: v_writelane_b32 v61, s5, 33 +; SI-NEXT: s_lshr_b64 s[4:5], s[8:9], 8 +; SI-NEXT: v_writelane_b32 v61, s4, 30 +; SI-NEXT: v_writelane_b32 v61, s5, 31 +; SI-NEXT: s_lshr_b64 s[4:5], s[10:11], 8 +; SI-NEXT: v_writelane_b32 v61, s4, 28 +; SI-NEXT: v_writelane_b32 v61, s5, 29 +; SI-NEXT: s_lshr_b64 s[4:5], s[12:13], 8 +; SI-NEXT: v_writelane_b32 v61, s4, 26 +; SI-NEXT: v_writelane_b32 v61, s5, 27 +; SI-NEXT: s_lshr_b64 s[4:5], s[14:15], 8 +; SI-NEXT: v_writelane_b32 v61, s4, 24 +; SI-NEXT: v_writelane_b32 v61, s5, 25 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 8 +; SI-NEXT: v_writelane_b32 v61, s4, 22 +; SI-NEXT: v_writelane_b32 v61, s5, 23 +; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; SI-NEXT: v_writelane_b32 v61, s4, 20 +; SI-NEXT: v_writelane_b32 v61, s5, 21 +; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 8 +; SI-NEXT: v_writelane_b32 v61, s4, 18 +; SI-NEXT: v_writelane_b32 v61, s5, 19 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], 8 +; SI-NEXT: v_writelane_b32 v61, s4, 16 +; SI-NEXT: v_writelane_b32 v61, s5, 17 +; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 8 +; SI-NEXT: v_writelane_b32 v61, s4, 14 +; SI-NEXT: v_writelane_b32 v61, s5, 15 +; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 8 +; SI-NEXT: v_writelane_b32 v61, s4, 12 +; SI-NEXT: v_writelane_b32 v61, s5, 13 +; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 8 +; SI-NEXT: v_writelane_b32 v61, s4, 10 +; SI-NEXT: v_writelane_b32 v61, s5, 11 +; SI-NEXT: s_lshr_b64 s[4:5], s[42:43], 8 +; SI-NEXT: v_writelane_b32 v61, s4, 8 +; SI-NEXT: v_writelane_b32 v61, s5, 9 +; SI-NEXT: s_lshr_b64 s[4:5], s[44:45], 8 +; SI-NEXT: v_writelane_b32 v61, s4, 6 +; SI-NEXT: v_writelane_b32 v61, s5, 7 +; SI-NEXT: s_lshr_b64 s[4:5], s[46:47], 8 +; SI-NEXT: v_writelane_b32 v61, s4, 4 +; SI-NEXT: v_writelane_b32 v61, s5, 5 +; SI-NEXT: s_lshr_b64 s[4:5], s[56:57], 8 +; SI-NEXT: v_writelane_b32 v61, s4, 2 +; SI-NEXT: v_writelane_b32 v61, s5, 3 +; SI-NEXT: s_lshr_b64 s[4:5], s[58:59], 24 +; SI-NEXT: v_writelane_b32 v61, s4, 0 +; SI-NEXT: s_lshr_b32 s27, s7, 16 +; SI-NEXT: s_lshr_b32 s29, s7, 8 +; SI-NEXT: s_lshr_b32 s61, s9, 16 +; SI-NEXT: s_lshr_b32 s26, s9, 8 +; SI-NEXT: s_lshr_b32 s28, s11, 8 +; SI-NEXT: s_lshr_b32 s60, s57, 24 +; SI-NEXT: s_lshr_b32 s96, s59, 24 +; SI-NEXT: s_lshr_b32 s97, s59, 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[72:73], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[76:77], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[14:15], 24 +; SI-NEXT: s_lshr_b64 s[88:89], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[92:93], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[34:35], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[36:37], s[22:23], 24 +; SI-NEXT: s_lshr_b64 s[38:39], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[48:49], s[24:25], 24 +; SI-NEXT: s_lshr_b64 s[50:51], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[52:53], s[40:41], 24 +; SI-NEXT: s_lshr_b64 s[54:55], s[40:41], 16 ; SI-NEXT: s_lshr_b64 s[64:65], s[42:43], 24 ; SI-NEXT: s_lshr_b64 s[66:67], s[42:43], 16 -; SI-NEXT: s_lshr_b64 s[68:69], s[42:43], 8 -; SI-NEXT: s_lshr_b64 s[70:71], s[44:45], 24 -; SI-NEXT: s_lshr_b64 s[80:81], s[44:45], 16 -; SI-NEXT: s_lshr_b64 s[82:83], s[44:45], 8 -; SI-NEXT: s_lshr_b64 s[84:85], s[28:29], 24 -; SI-NEXT: s_lshr_b64 s[86:87], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[96:97], s[28:29], 8 -; SI-NEXT: s_lshr_b64 s[98:99], s[26:27], 24 -; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[56:57], s[26:27], 8 -; SI-NEXT: s_lshr_b64 s[58:59], s[24:25], 24 -; SI-NEXT: s_lshr_b64 s[60:61], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[62:63], s[24:25], 8 -; SI-NEXT: s_lshr_b64 s[72:73], s[22:23], 24 -; SI-NEXT: s_lshr_b64 s[74:75], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[22:23], 8 -; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 -; SI-NEXT: s_lshr_b64 s[88:89], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[90:91], s[20:21], 8 -; SI-NEXT: s_lshr_b64 s[92:93], s[18:19], 24 -; SI-NEXT: s_lshr_b64 s[94:95], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[30:31], s[18:19], 8 -; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 24 -; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 16 -; SI-NEXT: s_lshr_b64 s[38:39], s[16:17], 8 +; SI-NEXT: s_lshr_b64 s[68:69], s[44:45], 24 +; SI-NEXT: s_lshr_b64 s[70:71], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[80:81], s[46:47], 24 +; SI-NEXT: s_lshr_b64 s[82:83], s[46:47], 16 +; SI-NEXT: s_lshr_b64 s[84:85], s[56:57], 24 +; SI-NEXT: s_lshr_b64 s[86:87], s[56:57], 16 +; SI-NEXT: v_writelane_b32 v61, s5, 1 +; SI-NEXT: s_lshr_b64 s[98:99], s[58:59], 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[58:59], 8 ; SI-NEXT: s_cbranch_execnz .LBB37_4 ; SI-NEXT: .LBB37_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v2, s5, 1.0 -; SI-NEXT: v_add_f32_e64 v1, s4, 1.0 -; SI-NEXT: v_lshr_b64 v[13:14], v[1:2], 24 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[1:2], 16 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[1:2], 8 -; SI-NEXT: v_add_f32_e64 v4, s7, 1.0 -; SI-NEXT: v_add_f32_e64 v3, s6, 1.0 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[3:4], 24 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[3:4], 16 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[3:4], 8 -; SI-NEXT: v_add_f32_e64 v6, s9, 1.0 -; SI-NEXT: v_add_f32_e64 v5, s8, 1.0 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[5:6], 24 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[5:6], 16 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[5:6], 8 -; SI-NEXT: v_add_f32_e64 v8, s11, 1.0 -; SI-NEXT: v_add_f32_e64 v7, s10, 1.0 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[7:8], 24 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[7:8], 16 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[7:8], 8 -; SI-NEXT: v_add_f32_e64 v10, s13, 1.0 -; SI-NEXT: v_add_f32_e64 v9, s12, 1.0 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[9:10], 24 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[9:10], 16 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[9:10], 8 -; SI-NEXT: v_add_f32_e64 v12, s15, 1.0 -; SI-NEXT: v_add_f32_e64 v11, s14, 1.0 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[11:12], 24 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[11:12], 16 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[11:12], 8 -; SI-NEXT: v_add_f32_e64 v16, s41, 1.0 -; SI-NEXT: v_add_f32_e64 v15, s40, 1.0 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[15:16], 24 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[15:16], 16 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[15:16], 8 -; SI-NEXT: v_add_f32_e64 v21, s43, 1.0 -; SI-NEXT: v_add_f32_e64 v20, s42, 1.0 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[20:21], 24 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[20:21], 16 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[20:21], 8 -; SI-NEXT: v_add_f32_e64 v26, s45, 1.0 -; SI-NEXT: v_add_f32_e64 v25, s44, 1.0 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[25:26], 24 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[25:26], 16 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[25:26], 8 -; SI-NEXT: v_add_f32_e64 v30, s29, 1.0 -; SI-NEXT: v_add_f32_e64 v29, s28, 1.0 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[29:30], 24 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[29:30], 16 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[29:30], 8 -; SI-NEXT: v_add_f32_e64 v36, s27, 1.0 -; SI-NEXT: v_add_f32_e64 v35, s26, 1.0 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[35:36], 24 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[35:36], 16 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[35:36], 8 -; SI-NEXT: v_add_f32_e64 v49, s25, 1.0 -; SI-NEXT: v_add_f32_e64 v48, s24, 1.0 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 24 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 16 -; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v2 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v2 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v4 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v4 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v6 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e64 v1, s14, 1.0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s14, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v6 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e64 v1, s12, 1.0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s12, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v8 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e64 v1, s10, 1.0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s10, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v8 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e64 v1, s8, 1.0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s8, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v8 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e64 v1, s7, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s6, 1.0 +; SI-NEXT: v_add_f32_e64 v13, s19, 1.0 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s6, v2 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v10 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 24, v1 +; SI-NEXT: v_add_f32_e64 v15, s21, 1.0 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v10 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v13 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v10 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 24, v15 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v12 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v15 +; SI-NEXT: v_add_f32_e64 v17, s23, 1.0 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v15 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v12 -; SI-NEXT: v_lshr_b64 v[17:18], v[48:49], 8 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 24, v17 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v16 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v17 +; SI-NEXT: v_add_f32_e64 v19, s25, 1.0 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v16 -; SI-NEXT: v_add_f32_e64 v53, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v52, s22, 1.0 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v17 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v16 -; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 24 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 24, v19 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v21 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_add_f32_e64 v21, s41, 1.0 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v21 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v19 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v21 -; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 16 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 24, v21 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v26 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_add_f32_e64 v37, s59, 1.0 +; SI-NEXT: v_add_f32_e64 v48, s58, 1.0 +; SI-NEXT: v_add_f32_e64 v32, s57, 1.0 +; SI-NEXT: v_add_f32_e64 v36, s56, 1.0 +; SI-NEXT: v_add_f32_e64 v29, s47, 1.0 +; SI-NEXT: v_add_f32_e64 v31, s46, 1.0 +; SI-NEXT: v_add_f32_e64 v25, s45, 1.0 +; SI-NEXT: v_add_f32_e64 v27, s44, 1.0 +; SI-NEXT: v_add_f32_e64 v23, s43, 1.0 +; SI-NEXT: v_add_f32_e64 v24, s42, 1.0 +; SI-NEXT: v_add_f32_e64 v22, s40, 1.0 +; SI-NEXT: v_add_f32_e64 v20, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v18, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v14, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s15, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s13, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s11, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s9, 1.0 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v21 +; SI-NEXT: v_readfirstlane_b32 s4, v48 +; SI-NEXT: v_readfirstlane_b32 s5, v37 +; SI-NEXT: v_readfirstlane_b32 s56, v36 +; SI-NEXT: v_readfirstlane_b32 s57, v32 +; SI-NEXT: v_readfirstlane_b32 s46, v31 +; SI-NEXT: v_readfirstlane_b32 s47, v29 +; SI-NEXT: v_readfirstlane_b32 s44, v27 +; SI-NEXT: v_readfirstlane_b32 s45, v25 +; SI-NEXT: v_readfirstlane_b32 s42, v24 +; SI-NEXT: v_readfirstlane_b32 s43, v23 +; SI-NEXT: v_readfirstlane_b32 s40, v22 +; SI-NEXT: v_readfirstlane_b32 s41, v21 +; SI-NEXT: v_readfirstlane_b32 s24, v20 +; SI-NEXT: v_readfirstlane_b32 s25, v19 +; SI-NEXT: v_readfirstlane_b32 s22, v18 +; SI-NEXT: v_readfirstlane_b32 s23, v17 +; SI-NEXT: v_readfirstlane_b32 s20, v16 +; SI-NEXT: v_readfirstlane_b32 s21, v15 +; SI-NEXT: v_readfirstlane_b32 s18, v14 +; SI-NEXT: v_readfirstlane_b32 s19, v13 +; SI-NEXT: v_readfirstlane_b32 s16, v12 +; SI-NEXT: v_readfirstlane_b32 s17, v11 +; SI-NEXT: v_readfirstlane_b32 s15, v9 +; SI-NEXT: v_readfirstlane_b32 s13, v7 +; SI-NEXT: v_readfirstlane_b32 s11, v5 +; SI-NEXT: v_readfirstlane_b32 s9, v3 +; SI-NEXT: v_readfirstlane_b32 s7, v1 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 24, v23 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[60:61], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[62:63], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[72:73], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[10:11], 8 +; SI-NEXT: s_lshr_b64 s[74:75], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[76:77], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[78:79], s[14:15], 24 +; SI-NEXT: s_lshr_b64 s[88:89], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[14:15], 8 +; SI-NEXT: s_lshr_b64 s[90:91], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[92:93], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[16:17], s[16:17], 8 +; SI-NEXT: s_lshr_b64 s[94:95], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[96:97], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[18:19], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[30:31], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[34:35], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[20:21], s[20:21], 8 +; SI-NEXT: s_lshr_b64 s[36:37], s[22:23], 24 +; SI-NEXT: s_lshr_b64 s[38:39], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[22:23], s[22:23], 8 +; SI-NEXT: s_lshr_b64 s[48:49], s[24:25], 24 +; SI-NEXT: s_lshr_b64 s[50:51], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[24:25], 8 +; SI-NEXT: s_lshr_b64 s[52:53], s[40:41], 24 +; SI-NEXT: s_lshr_b64 s[54:55], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[40:41], 8 +; SI-NEXT: s_lshr_b64 s[64:65], s[42:43], 24 +; SI-NEXT: s_lshr_b64 s[66:67], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[42:43], 8 +; SI-NEXT: s_lshr_b64 s[68:69], s[44:45], 24 +; SI-NEXT: s_lshr_b64 s[70:71], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[44:45], 8 +; SI-NEXT: s_lshr_b64 s[80:81], s[46:47], 24 +; SI-NEXT: s_lshr_b64 s[82:83], s[46:47], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[46:47], 8 +; SI-NEXT: s_lshr_b64 s[84:85], s[56:57], 24 +; SI-NEXT: s_lshr_b64 s[86:87], s[56:57], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[56:57], 8 +; SI-NEXT: s_lshr_b64 vcc, s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[98:99], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 8 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v41, 8, v1 +; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v3 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v3 +; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v5 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v5 +; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v7 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v7 +; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v9 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v26, 8, v9 +; SI-NEXT: v_lshrrev_b32_e32 v28, 24, v11 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v11 +; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v13 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v26 -; SI-NEXT: v_add_f32_e64 v41, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v40, s20, 1.0 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v26 -; SI-NEXT: v_add_f32_e64 v58, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v57, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v46, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v45, s18, 1.0 -; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 8 -; SI-NEXT: v_lshr_b64 v[31:32], v[40:41], 16 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v30 -; SI-NEXT: v_lshr_b64 v[32:33], v[40:41], 8 -; SI-NEXT: v_lshr_b64 v[37:38], v[45:46], 16 -; SI-NEXT: v_lshr_b64 v[42:43], v[57:58], 16 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v30 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: v_lshr_b64 v[27:28], v[40:41], 24 -; SI-NEXT: v_lshr_b64 v[33:34], v[45:46], 24 -; SI-NEXT: v_lshr_b64 v[38:39], v[45:46], 8 -; SI-NEXT: v_lshr_b64 v[50:51], v[57:58], 24 -; SI-NEXT: v_lshr_b64 v[43:44], v[57:58], 8 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v30 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v55, 24, v36 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v36 -; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v49 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v49 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v53 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v53 -; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v41 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v34, 8, v41 -; SI-NEXT: v_lshrrev_b32_e32 v18, 24, v46 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v46 -; SI-NEXT: v_lshrrev_b32_e32 v23, 8, v46 -; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v58 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v58 -; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v58 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v38, 8, v23 +; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v4, 8, v25 +; SI-NEXT: v_lshrrev_b32_e32 v50, 24, v29 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v29 +; SI-NEXT: v_lshrrev_b32_e32 v52, 24, v32 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v53, 8, v32 +; SI-NEXT: v_lshrrev_b32_e32 v54, 24, v37 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v37 ; SI-NEXT: s_branch .LBB37_5 ; SI-NEXT: .LBB37_3: -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s48, 0 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s49, 1 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s60, 0 +; SI-NEXT: v_writelane_b32 v61, s61, 1 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s60, 2 +; SI-NEXT: v_writelane_b32 v61, s61, 3 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s60, 4 +; SI-NEXT: v_writelane_b32 v61, s61, 5 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s60, 6 +; SI-NEXT: v_writelane_b32 v61, s61, 7 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s60, 8 +; SI-NEXT: v_writelane_b32 v61, s61, 9 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s60, 10 +; SI-NEXT: v_writelane_b32 v61, s61, 11 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s60, 12 +; SI-NEXT: v_writelane_b32 v61, s61, 13 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s60, 14 +; SI-NEXT: v_writelane_b32 v61, s61, 15 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s60, 16 +; SI-NEXT: v_writelane_b32 v61, s61, 17 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s60, 18 +; SI-NEXT: v_writelane_b32 v61, s61, 19 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s60, 20 +; SI-NEXT: v_writelane_b32 v61, s61, 21 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s60, 22 +; SI-NEXT: v_writelane_b32 v61, s61, 23 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s60, 24 +; SI-NEXT: v_writelane_b32 v61, s61, 25 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s60, 26 +; SI-NEXT: v_writelane_b32 v61, s61, 27 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s60, 28 +; SI-NEXT: v_writelane_b32 v61, s61, 29 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s60, 30 +; SI-NEXT: v_writelane_b32 v61, s61, 31 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s60, 32 +; SI-NEXT: v_writelane_b32 v61, s61, 33 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s60, 34 +; SI-NEXT: v_writelane_b32 v61, s61, 35 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s60, 36 +; SI-NEXT: v_writelane_b32 v61, s61, 37 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s60, 38 +; SI-NEXT: v_writelane_b32 v61, s61, 39 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr97 +; SI-NEXT: ; implicit-def: $sgpr96 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s60, 40 +; SI-NEXT: v_writelane_b32 v61, s61, 41 +; SI-NEXT: ; implicit-def: $sgpr98 +; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; implicit-def: $sgpr84 +; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; implicit-def: $sgpr48 ; SI-NEXT: ; implicit-def: $sgpr38 ; SI-NEXT: ; implicit-def: $sgpr36 @@ -45120,633 +45426,335 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: ; implicit-def: $sgpr62 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $sgpr98 -; SI-NEXT: ; implicit-def: $sgpr96 -; SI-NEXT: ; implicit-def: $sgpr86 -; SI-NEXT: ; implicit-def: $sgpr84 -; SI-NEXT: ; implicit-def: $sgpr82 -; SI-NEXT: ; implicit-def: $sgpr80 -; SI-NEXT: ; implicit-def: $sgpr70 -; SI-NEXT: ; implicit-def: $sgpr68 -; SI-NEXT: ; implicit-def: $sgpr66 -; SI-NEXT: ; implicit-def: $sgpr64 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $sgpr52 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s48, 2 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s49, 3 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s48, 4 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s49, 5 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s48, 6 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s49, 7 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s48, 8 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s49, 9 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s48, 10 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s49, 11 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s48, 12 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s49, 13 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s48, 14 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s49, 15 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s48, 16 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s49, 17 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s48, 18 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s49, 19 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s48, 20 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s49, 21 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s48, 22 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s49, 23 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s48, 24 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s49, 25 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s48, 26 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s49, 27 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s48, 28 -; SI-NEXT: v_writelane_b32 v61, s49, 29 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s48, 30 -; SI-NEXT: v_writelane_b32 v61, s49, 31 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s48, 32 -; SI-NEXT: v_writelane_b32 v61, s49, 33 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr48 ; SI-NEXT: s_branch .LBB37_2 ; SI-NEXT: .LBB37_4: -; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 34 -; SI-NEXT: v_mov_b32_e32 v54, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 35 -; SI-NEXT: v_mov_b32_e32 v51, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 36 -; SI-NEXT: v_mov_b32_e32 v39, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 37 -; SI-NEXT: v_mov_b32_e32 v23, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 38 -; SI-NEXT: v_mov_b32_e32 v22, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 39 -; SI-NEXT: v_mov_b32_e32 v18, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 40 -; SI-NEXT: v_mov_b32_e32 v34, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 41 -; SI-NEXT: v_mov_b32_e32 v28, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 42 -; SI-NEXT: v_mov_b32_e32 v24, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 43 -; SI-NEXT: v_mov_b32_e32 v17, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 44 -; SI-NEXT: v_mov_b32_e32 v19, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 45 -; SI-NEXT: v_mov_b32_e32 v14, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 46 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 47 -; SI-NEXT: v_mov_b32_e32 v59, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 48 -; SI-NEXT: v_mov_b32_e32 v56, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 49 -; SI-NEXT: v_mov_b32_e32 v47, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 50 -; SI-NEXT: v_mov_b32_e32 v44, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 51 -; SI-NEXT: v_mov_b32_e32 v55, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 52 -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 53 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 54 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 55 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 56 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 57 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 58 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 59 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 60 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 61 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 62 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 63 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 0 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 1 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 2 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 3 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 4 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 5 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 6 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, s58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s5, v61, 42 +; SI-NEXT: v_mov_b32_e32 v10, s5 +; SI-NEXT: v_readlane_b32 s5, v61, 43 +; SI-NEXT: v_mov_b32_e32 v53, s5 +; SI-NEXT: v_readlane_b32 s5, v61, 44 +; SI-NEXT: v_mov_b32_e32 v8, s5 +; SI-NEXT: v_readlane_b32 s5, v61, 45 +; SI-NEXT: v_mov_b32_e32 v6, s5 +; SI-NEXT: v_readlane_b32 s5, v61, 46 +; SI-NEXT: v_mov_b32_e32 v51, s5 +; SI-NEXT: v_readlane_b32 s5, v61, 47 +; SI-NEXT: v_mov_b32_e32 v50, s5 +; SI-NEXT: v_readlane_b32 s5, v61, 48 +; SI-NEXT: v_mov_b32_e32 v4, s5 +; SI-NEXT: v_readlane_b32 s5, v61, 49 +; SI-NEXT: v_mov_b32_e32 v49, s5 +; SI-NEXT: v_readlane_b32 s5, v61, 50 +; SI-NEXT: v_mov_b32_e32 v39, s5 +; SI-NEXT: v_readlane_b32 s5, v61, 51 +; SI-NEXT: v_mov_b32_e32 v38, s5 +; SI-NEXT: v_readlane_b32 s5, v61, 52 +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: v_readlane_b32 s5, v61, 53 +; SI-NEXT: v_mov_b32_e32 v12, s5 +; SI-NEXT: v_readlane_b32 s5, v61, 54 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, s5 +; SI-NEXT: v_readlane_b32 s5, v61, 55 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, s5 +; SI-NEXT: v_readlane_b32 s5, v61, 56 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, s5 +; SI-NEXT: v_readlane_b32 s5, v61, 57 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, s5 +; SI-NEXT: v_readlane_b32 s5, v61, 58 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, s5 +; SI-NEXT: v_readlane_b32 s5, v61, 59 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, s5 +; SI-NEXT: v_readlane_b32 s5, v61, 60 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, s5 +; SI-NEXT: v_readlane_b32 s5, v61, 61 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, s5 +; SI-NEXT: v_readlane_b32 s5, v61, 62 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, s5 +; SI-NEXT: v_readlane_b32 s5, v61, 63 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, s5 +; SI-NEXT: v_readlane_b32 s5, v62, 0 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, s5 +; SI-NEXT: v_readlane_b32 s5, v62, 1 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, s5 +; SI-NEXT: v_readlane_b32 s5, v62, 2 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, s5 +; SI-NEXT: v_readlane_b32 s5, v62, 3 +; SI-NEXT: v_mov_b32_e32 v35, s5 +; SI-NEXT: v_readlane_b32 s5, v62, 4 +; SI-NEXT: v_mov_b32_e32 v34, s5 +; SI-NEXT: v_readlane_b32 s5, v62, 5 +; SI-NEXT: v_mov_b32_e32 v33, s5 +; SI-NEXT: v_readlane_b32 s5, v62, 6 +; SI-NEXT: v_mov_b32_e32 v30, s5 +; SI-NEXT: v_readlane_b32 s5, v62, 7 +; SI-NEXT: v_mov_b32_e32 v28, s5 +; SI-NEXT: v_readlane_b32 s5, v62, 8 +; SI-NEXT: v_mov_b32_e32 v26, s5 +; SI-NEXT: v_readlane_b32 s5, v62, 9 +; SI-NEXT: v_mov_b32_e32 v60, s5 +; SI-NEXT: v_readlane_b32 s5, v62, 10 +; SI-NEXT: v_mov_b32_e32 v59, s5 +; SI-NEXT: v_readlane_b32 s5, v62, 11 +; SI-NEXT: v_mov_b32_e32 v58, s5 +; SI-NEXT: v_readlane_b32 s5, v62, 12 +; SI-NEXT: v_mov_b32_e32 v57, s5 +; SI-NEXT: v_readlane_b32 s5, v62, 13 +; SI-NEXT: v_mov_b32_e32 v1, s14 +; SI-NEXT: v_mov_b32_e32 v56, s5 +; SI-NEXT: v_readlane_b32 s5, v62, 14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 7 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, s12 +; SI-NEXT: v_mov_b32_e32 v46, s5 +; SI-NEXT: v_readlane_b32 s5, v62, 15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 8 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: v_mov_b32_e32 v45, s5 +; SI-NEXT: v_readlane_b32 s5, v62, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 9 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: v_mov_b32_e32 v42, s5 +; SI-NEXT: v_readlane_b32 s5, v62, 17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 10 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, s5 +; SI-NEXT: v_mov_b32_e32 v37, s59 +; SI-NEXT: v_mov_b32_e32 v32, s57 +; SI-NEXT: v_mov_b32_e32 v29, s47 +; SI-NEXT: v_mov_b32_e32 v25, s45 +; SI-NEXT: v_mov_b32_e32 v23, s43 +; SI-NEXT: v_mov_b32_e32 v21, s41 +; SI-NEXT: v_mov_b32_e32 v19, s25 +; SI-NEXT: v_mov_b32_e32 v17, s23 +; SI-NEXT: v_mov_b32_e32 v15, s21 +; SI-NEXT: v_mov_b32_e32 v13, s19 +; SI-NEXT: v_mov_b32_e32 v11, s17 +; SI-NEXT: v_mov_b32_e32 v9, s15 +; SI-NEXT: v_mov_b32_e32 v7, s13 +; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v54, s96 +; SI-NEXT: v_mov_b32_e32 v52, s60 +; SI-NEXT: v_mov_b32_e32 v47, s28 +; SI-NEXT: v_mov_b32_e32 v44, s26 +; SI-NEXT: v_mov_b32_e32 v43, s61 +; SI-NEXT: v_mov_b32_e32 v41, s29 +; SI-NEXT: v_mov_b32_e32 v40, s27 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 11 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v12, s16 +; SI-NEXT: v_mov_b32_e32 v14, s18 +; SI-NEXT: v_mov_b32_e32 v16, s20 +; SI-NEXT: v_mov_b32_e32 v18, s22 +; SI-NEXT: v_mov_b32_e32 v20, s24 +; SI-NEXT: v_mov_b32_e32 v22, s40 +; SI-NEXT: v_mov_b32_e32 v24, s42 +; SI-NEXT: v_mov_b32_e32 v27, s44 +; SI-NEXT: v_mov_b32_e32 v31, s46 +; SI-NEXT: v_mov_b32_e32 v36, s56 +; SI-NEXT: v_readlane_b32 s26, v61, 40 +; SI-NEXT: v_readlane_b32 s28, v61, 38 +; SI-NEXT: v_readlane_b32 s6, v61, 36 +; SI-NEXT: v_readlane_b32 s58, v61, 34 +; SI-NEXT: v_readlane_b32 s60, v61, 32 +; SI-NEXT: v_readlane_b32 s8, v61, 30 +; SI-NEXT: v_readlane_b32 s10, v61, 28 +; SI-NEXT: v_readlane_b32 s12, v61, 26 +; SI-NEXT: v_readlane_b32 s14, v61, 24 +; SI-NEXT: v_readlane_b32 s16, v61, 22 +; SI-NEXT: s_mov_b32 s96, s94 +; SI-NEXT: v_readlane_b32 s94, v61, 20 +; SI-NEXT: v_readlane_b32 s18, v61, 18 +; SI-NEXT: v_readlane_b32 s20, v61, 16 +; SI-NEXT: v_readlane_b32 s22, v61, 14 +; SI-NEXT: v_readlane_b32 s24, v61, 12 +; SI-NEXT: v_readlane_b32 s40, v61, 10 +; SI-NEXT: v_readlane_b32 s42, v61, 8 +; SI-NEXT: v_readlane_b32 s44, v61, 6 +; SI-NEXT: v_readlane_b32 s46, v61, 4 +; SI-NEXT: v_readlane_b32 s56, v61, 2 +; SI-NEXT: v_readlane_b32 vcc_lo, v61, 0 +; SI-NEXT: v_mov_b32_e32 v55, s97 +; SI-NEXT: v_readlane_b32 s27, v61, 41 +; SI-NEXT: v_readlane_b32 s29, v61, 39 +; SI-NEXT: v_readlane_b32 s7, v61, 37 +; SI-NEXT: v_readlane_b32 s59, v61, 35 +; SI-NEXT: v_readlane_b32 s61, v61, 33 +; SI-NEXT: v_readlane_b32 s9, v61, 31 +; SI-NEXT: v_readlane_b32 s11, v61, 29 +; SI-NEXT: v_readlane_b32 s13, v61, 27 +; SI-NEXT: v_readlane_b32 s15, v61, 25 +; SI-NEXT: v_readlane_b32 s17, v61, 23 +; SI-NEXT: v_readlane_b32 s95, v61, 21 +; SI-NEXT: v_readlane_b32 s19, v61, 19 +; SI-NEXT: v_readlane_b32 s21, v61, 17 +; SI-NEXT: v_readlane_b32 s23, v61, 15 +; SI-NEXT: v_readlane_b32 s25, v61, 13 +; SI-NEXT: v_readlane_b32 s41, v61, 11 +; SI-NEXT: v_readlane_b32 s43, v61, 9 +; SI-NEXT: v_readlane_b32 s45, v61, 7 +; SI-NEXT: v_readlane_b32 s47, v61, 5 +; SI-NEXT: v_readlane_b32 s57, v61, 3 +; SI-NEXT: v_readlane_b32 vcc_hi, v61, 1 +; SI-NEXT: .LBB37_5: ; %end +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v48, 0xff, v48 +; SI-NEXT: s_lshl_b32 s4, s4, 8 +; SI-NEXT: v_or_b32_e32 v48, s4, v48 +; SI-NEXT: s_and_b32 s4, s98, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, vcc_lo, 24 +; SI-NEXT: v_and_b32_e32 v48, 0xffff, v48 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v37, 0xff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; SI-NEXT: v_or_b32_e32 v48, s4, v48 +; SI-NEXT: v_or_b32_e32 v10, v37, v10 +; SI-NEXT: v_and_b32_e32 v37, 0xff, v55 +; SI-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 12 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v54 +; SI-NEXT: v_or_b32_e32 v37, v48, v37 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v37 +; SI-NEXT: v_add_i32_e32 v37, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v10, v37, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 13 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v10, 0xff, v36 +; SI-NEXT: s_lshl_b32 s4, s56, 8 +; SI-NEXT: v_or_b32_e32 v10, s4, v10 +; SI-NEXT: s_and_b32 s4, s86, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s84, 24 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_or_b32_e32 v10, s4, v10 +; SI-NEXT: v_add_i32_e32 v36, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v10, v36, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 14 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v10, 0xff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v53 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v10, v10, v32 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v52 +; SI-NEXT: v_or_b32_e32 v8, v32, v8 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_add_i32_e32 v10, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v8, v10, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 15 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v8, 0xff, v31 +; SI-NEXT: s_lshl_b32 s4, s46, 8 +; SI-NEXT: v_or_b32_e32 v8, s4, v8 +; SI-NEXT: s_and_b32 s4, s82, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s80, 24 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_or_b32_e32 v8, s4, v8 +; SI-NEXT: v_add_i32_e32 v10, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v8, v10, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 16 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v8, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v50 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 17 -; SI-NEXT: v_mov_b32_e32 v2, s5 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v6, 0xff, v27 +; SI-NEXT: s_lshl_b32 s4, s44, 8 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s70, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s68, 24 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: v_add_i32_e32 v8, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 32 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v6, 0xff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v39 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v13, s48 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 33 -; SI-NEXT: v_readlane_b32 s4, v61, 30 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 31 -; SI-NEXT: v_readlane_b32 s4, v61, 28 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 29 -; SI-NEXT: v_readlane_b32 s4, v61, 26 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 27 -; SI-NEXT: v_readlane_b32 s4, v61, 24 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 25 -; SI-NEXT: v_readlane_b32 s4, v61, 22 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 23 -; SI-NEXT: v_readlane_b32 s4, v61, 20 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 21 -; SI-NEXT: v_readlane_b32 s4, v61, 18 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 19 -; SI-NEXT: v_readlane_b32 s4, v61, 16 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 17 -; SI-NEXT: v_readlane_b32 s4, v61, 14 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 15 -; SI-NEXT: v_readlane_b32 s4, v61, 12 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 13 -; SI-NEXT: v_readlane_b32 s4, v61, 10 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 11 -; SI-NEXT: v_readlane_b32 s4, v61, 8 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 9 -; SI-NEXT: v_readlane_b32 s4, v61, 6 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 7 -; SI-NEXT: v_readlane_b32 s4, v61, 4 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 5 -; SI-NEXT: v_readlane_b32 s4, v61, 2 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 3 -; SI-NEXT: v_readlane_b32 s4, v61, 0 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v13, s50 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v13, s52 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v13, s54 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v13, s64 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v13, s66 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v13, s68 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v13, s70 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v13, s80 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v13, s82 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v13, s84 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v13, s86 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v13, s96 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v13, s98 -; SI-NEXT: v_mov_b32_e32 v27, s62 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v13, s46 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v27, s72 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v13, s56 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v27, s74 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v13, s58 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v27, s76 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v57, s16 -; SI-NEXT: v_mov_b32_e32 v58, s17 -; SI-NEXT: v_mov_b32_e32 v45, s18 -; SI-NEXT: v_mov_b32_e32 v46, s19 -; SI-NEXT: v_mov_b32_e32 v40, s20 -; SI-NEXT: v_mov_b32_e32 v41, s21 -; SI-NEXT: v_mov_b32_e32 v52, s22 -; SI-NEXT: v_mov_b32_e32 v53, s23 -; SI-NEXT: v_mov_b32_e32 v48, s24 -; SI-NEXT: v_mov_b32_e32 v49, s25 -; SI-NEXT: v_mov_b32_e32 v35, s26 -; SI-NEXT: v_mov_b32_e32 v36, s27 -; SI-NEXT: v_mov_b32_e32 v29, s28 -; SI-NEXT: v_mov_b32_e32 v30, s29 -; SI-NEXT: v_mov_b32_e32 v25, s44 -; SI-NEXT: v_mov_b32_e32 v26, s45 -; SI-NEXT: v_mov_b32_e32 v20, s42 -; SI-NEXT: v_mov_b32_e32 v21, s43 -; SI-NEXT: v_mov_b32_e32 v15, s40 -; SI-NEXT: v_mov_b32_e32 v16, s41 -; SI-NEXT: v_mov_b32_e32 v11, s14 -; SI-NEXT: v_mov_b32_e32 v12, s15 -; SI-NEXT: v_mov_b32_e32 v9, s12 -; SI-NEXT: v_mov_b32_e32 v10, s13 -; SI-NEXT: v_mov_b32_e32 v7, s10 -; SI-NEXT: v_mov_b32_e32 v8, s11 -; SI-NEXT: v_mov_b32_e32 v5, s8 -; SI-NEXT: v_mov_b32_e32 v6, s9 -; SI-NEXT: v_mov_b32_e32 v3, s6 -; SI-NEXT: v_mov_b32_e32 v4, s7 -; SI-NEXT: v_readlane_b32 s5, v61, 1 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v13, s60 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v27, s78 -; SI-NEXT: v_mov_b32_e32 v31, s88 -; SI-NEXT: v_mov_b32_e32 v32, s90 -; SI-NEXT: v_mov_b32_e32 v33, s92 -; SI-NEXT: v_mov_b32_e32 v37, s94 -; SI-NEXT: v_mov_b32_e32 v38, s30 -; SI-NEXT: v_mov_b32_e32 v50, s34 -; SI-NEXT: v_mov_b32_e32 v42, s36 -; SI-NEXT: v_mov_b32_e32 v43, s38 -; SI-NEXT: .LBB37_5: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v43 -; SI-NEXT: v_and_b32_e32 v57, 0xff, v57 -; SI-NEXT: v_and_b32_e32 v42, 0xff, v42 -; SI-NEXT: v_or_b32_e32 v43, v57, v43 -; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v50 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; SI-NEXT: v_or_b32_e32 v50, v50, v42 -; SI-NEXT: v_and_b32_e32 v42, 0xffff, v43 -; SI-NEXT: v_or_b32_e32 v50, v42, v50 -; SI-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v50, 0xff, v58 -; SI-NEXT: v_lshlrev_b32_e32 v54, 8, v54 -; SI-NEXT: v_and_b32_e32 v51, 0xff, v51 -; SI-NEXT: v_or_b32_e32 v50, v50, v54 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v39 -; SI-NEXT: v_or_b32_e32 v39, v39, v51 -; SI-NEXT: v_and_b32_e32 v50, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v39, v50, v39 -; SI-NEXT: v_add_i32_e32 v50, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v39, v50, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v38 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v39, 0xff, v45 -; SI-NEXT: v_and_b32_e32 v37, 0xff, v37 -; SI-NEXT: v_or_b32_e32 v38, v39, v38 -; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v33 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; SI-NEXT: v_or_b32_e32 v33, v33, v37 -; SI-NEXT: v_and_b32_e32 v37, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v33, v37, v33 -; SI-NEXT: v_add_i32_e32 v37, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v33, v37, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v4, 0xff, v24 +; SI-NEXT: s_lshl_b32 s4, s42, 8 +; SI-NEXT: v_or_b32_e32 v4, s4, v4 +; SI-NEXT: s_and_b32 s4, s66, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s64, 24 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_or_b32_e32 v4, s4, v4 +; SI-NEXT: v_add_i32_e32 v6, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v33, 0xff, v46 -; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 -; SI-NEXT: v_or_b32_e32 v23, v33, v23 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v18 -; SI-NEXT: v_or_b32_e32 v18, v18, v22 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v18, v22, v18 -; SI-NEXT: v_add_i32_e32 v22, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v18, v22, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v32 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v40 -; SI-NEXT: v_or_b32_e32 v18, v22, v18 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v31 -; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v27 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v18, v18, v22 -; SI-NEXT: v_add_i32_e32 v22, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v18, v22, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xff, v41 -; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v34 -; SI-NEXT: v_or_b32_e32 v18, v18, v22 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v24 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v18, v18, v22 -; SI-NEXT: v_add_i32_e32 v22, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v18, v22, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v38 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_lshl_b32 s4, s40, 8 +; SI-NEXT: s_lshl_b32 s5, s52, 24 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_readlane_b32 s99, v63, 35 ; SI-NEXT: v_readlane_b32 s98, v63, 34 ; SI-NEXT: v_readlane_b32 s97, v63, 33 -; SI-NEXT: v_readlane_b32 s96, v63, 32 ; SI-NEXT: v_readlane_b32 s87, v63, 31 ; SI-NEXT: v_readlane_b32 s86, v63, 30 ; SI-NEXT: v_readlane_b32 s85, v63, 29 @@ -45764,488 +45772,324 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_readlane_b32 s65, v63, 17 ; SI-NEXT: v_readlane_b32 s64, v63, 16 ; SI-NEXT: v_readlane_b32 s55, v63, 15 -; SI-NEXT: v_readlane_b32 s54, v63, 14 ; SI-NEXT: v_readlane_b32 s53, v63, 13 ; SI-NEXT: v_readlane_b32 s52, v63, 12 ; SI-NEXT: v_readlane_b32 s51, v63, 11 -; SI-NEXT: v_readlane_b32 s50, v63, 10 ; SI-NEXT: v_readlane_b32 s49, v63, 9 -; SI-NEXT: v_readlane_b32 s48, v63, 8 ; SI-NEXT: v_readlane_b32 s39, v63, 7 -; SI-NEXT: v_readlane_b32 s38, v63, 6 ; SI-NEXT: v_readlane_b32 s37, v63, 5 -; SI-NEXT: v_readlane_b32 s36, v63, 4 ; SI-NEXT: v_readlane_b32 s35, v63, 3 -; SI-NEXT: v_readlane_b32 s34, v63, 2 ; SI-NEXT: v_readlane_b32 s31, v63, 1 -; SI-NEXT: v_readlane_b32 s30, v63, 0 -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v22 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v52 -; SI-NEXT: v_or_b32_e32 v18, v22, v18 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v23 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_or_b32_e32 v18, v18, v22 -; SI-NEXT: v_add_i32_e32 v22, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v18, v22, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xff, v53 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: v_and_b32_e32 v18, 0xff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v14, v14, v18 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_or_b32_e32 v14, v17, v14 -; SI-NEXT: v_add_i32_e32 v17, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v17 -; SI-NEXT: v_and_b32_e32 v17, 0xff, v48 -; SI-NEXT: v_or_b32_e32 v14, v17, v14 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17 -; SI-NEXT: v_or_b32_e32 v13, v17, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_add_i32_e32 v4, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xff, v49 -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v60 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v59 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v56 -; SI-NEXT: v_or_b32_e32 v14, v17, v14 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_and_b32 s4, s54, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: v_add_i32_e32 v4, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v35 -; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v21 +; SI-NEXT: s_lshl_b32 s4, s24, 8 +; SI-NEXT: s_lshl_b32 s5, s48, 24 +; SI-NEXT: v_readlane_b32 s54, v63, 14 +; SI-NEXT: v_readlane_b32 s48, v63, 8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17 -; SI-NEXT: v_or_b32_e32 v14, v17, v14 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xff, v36 -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v47 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v44 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v55 -; SI-NEXT: v_or_b32_e32 v14, v17, v14 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17 -; SI-NEXT: v_or_b32_e32 v14, v17, v14 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_and_b32 s4, s50, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: v_add_i32_e32 v4, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xff, v30 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v19 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_lshl_b32 s4, s22, 8 +; SI-NEXT: s_lshl_b32 s5, s36, 24 +; SI-NEXT: v_readlane_b32 s50, v63, 10 +; SI-NEXT: v_readlane_b32 s36, v63, 4 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v17, v14 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17 -; SI-NEXT: v_or_b32_e32 v14, v17, v14 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_and_b32 s4, s38, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: v_add_i32_e32 v4, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xff, v26 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v17 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_lshl_b32 s4, s20, 8 +; SI-NEXT: s_lshl_b32 s5, s30, 24 +; SI-NEXT: v_readlane_b32 s38, v63, 6 +; SI-NEXT: v_readlane_b32 s30, v63, 0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v17, v14 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17 -; SI-NEXT: v_or_b32_e32 v14, v17, v14 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_and_b32 s4, s34, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: v_add_i32_e32 v4, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xff, v21 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v15 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_lshl_b32 s4, s18, 8 +; SI-NEXT: s_lshl_b32 s5, s94, 24 +; SI-NEXT: v_readlane_b32 s34, v63, 2 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v17, v14 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v17 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_and_b32 s4, s96, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xff, v16 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v15 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v34 +; SI-NEXT: s_lshl_b32 s4, s16, 8 +; SI-NEXT: s_lshl_b32 s5, s90, 24 +; SI-NEXT: v_readlane_b32 s96, v63, 32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; SI-NEXT: v_or_b32_e32 v11, v11, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_or_b32_e32 v11, v11, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_and_b32 s4, s92, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v12 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v33 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v28 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_lshl_b32 s4, s14, 8 +; SI-NEXT: s_lshl_b32 s5, s78, 24 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v59 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_and_b32 s4, s88, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; SI-NEXT: v_or_b32_e32 v9, v9, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_or_b32_e32 v9, v9, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v26 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_lshl_b32 s4, s12, 8 +; SI-NEXT: s_lshl_b32 s5, s74, 24 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v56 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_and_b32 s4, s76, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v7, v7, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_or_b32_e32 v7, v7, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v58 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_lshl_b32 s4, s10, 8 +; SI-NEXT: s_lshl_b32 s5, s62, 24 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: v_or_b32_e32 v5, v5, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_or_b32_e32 v5, v5, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; SI-NEXT: v_or_b32_e32 v3, v3, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_or_b32_e32 v3, v3, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_and_b32 s4, s72, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v47 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v46 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v45 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: s_lshl_b32 s4, s8, 8 +; SI-NEXT: s_lshl_b32 s5, s58, 24 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x70, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_and_b32 s4, s60, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v44 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v43 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v42 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: s_lshl_b32 s4, s6, 8 +; SI-NEXT: s_lshl_b32 s5, s26, 24 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v40 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -46263,9 +46107,9 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -46295,27 +46139,55 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_writelane_b32 v63, s55, 15 ; VI-NEXT: v_writelane_b32 v63, s64, 16 ; VI-NEXT: v_writelane_b32 v63, s65, 17 +; VI-NEXT: v_mov_b32_e32 v20, s16 ; VI-NEXT: v_writelane_b32 v63, s66, 18 +; VI-NEXT: v_readfirstlane_b32 s56, v20 +; VI-NEXT: v_mov_b32_e32 v20, s17 ; VI-NEXT: v_writelane_b32 v63, s67, 19 +; VI-NEXT: v_readfirstlane_b32 s57, v20 +; VI-NEXT: v_mov_b32_e32 v20, s18 ; VI-NEXT: v_writelane_b32 v63, s68, 20 +; VI-NEXT: v_readfirstlane_b32 s46, v20 +; VI-NEXT: v_mov_b32_e32 v20, s19 ; VI-NEXT: v_writelane_b32 v63, s69, 21 +; VI-NEXT: v_readfirstlane_b32 s47, v20 +; VI-NEXT: v_mov_b32_e32 v20, s20 ; VI-NEXT: v_writelane_b32 v63, s70, 22 +; VI-NEXT: v_readfirstlane_b32 s44, v20 +; VI-NEXT: v_mov_b32_e32 v20, s21 ; VI-NEXT: v_writelane_b32 v63, s71, 23 +; VI-NEXT: v_readfirstlane_b32 s45, v20 +; VI-NEXT: v_mov_b32_e32 v20, s22 ; VI-NEXT: v_writelane_b32 v63, s80, 24 +; VI-NEXT: v_readfirstlane_b32 s42, v20 +; VI-NEXT: v_mov_b32_e32 v20, s23 ; VI-NEXT: v_writelane_b32 v63, s81, 25 +; VI-NEXT: v_readfirstlane_b32 s43, v20 +; VI-NEXT: v_mov_b32_e32 v20, s24 ; VI-NEXT: v_writelane_b32 v63, s82, 26 +; VI-NEXT: v_readfirstlane_b32 s40, v20 +; VI-NEXT: v_mov_b32_e32 v20, s25 ; VI-NEXT: v_writelane_b32 v63, s83, 27 +; VI-NEXT: v_readfirstlane_b32 s41, v20 +; VI-NEXT: v_mov_b32_e32 v20, s26 ; VI-NEXT: v_writelane_b32 v63, s84, 28 +; VI-NEXT: v_readfirstlane_b32 s24, v20 +; VI-NEXT: v_mov_b32_e32 v20, s27 ; VI-NEXT: v_writelane_b32 v63, s85, 29 +; VI-NEXT: v_readfirstlane_b32 s25, v20 +; VI-NEXT: v_mov_b32_e32 v20, s28 ; VI-NEXT: v_writelane_b32 v63, s86, 30 +; VI-NEXT: v_readfirstlane_b32 s22, v20 +; VI-NEXT: v_mov_b32_e32 v20, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 ; VI-NEXT: v_writelane_b32 v63, s87, 31 -; VI-NEXT: v_readfirstlane_b32 s44, v1 -; VI-NEXT: v_readfirstlane_b32 s45, v2 -; VI-NEXT: v_readfirstlane_b32 s42, v3 -; VI-NEXT: v_readfirstlane_b32 s43, v4 -; VI-NEXT: v_readfirstlane_b32 s40, v5 -; VI-NEXT: v_readfirstlane_b32 s41, v6 +; VI-NEXT: v_readfirstlane_b32 s23, v20 +; VI-NEXT: v_readfirstlane_b32 s20, v1 +; VI-NEXT: v_readfirstlane_b32 s21, v2 +; VI-NEXT: v_readfirstlane_b32 s18, v3 +; VI-NEXT: v_readfirstlane_b32 s19, v4 +; VI-NEXT: v_readfirstlane_b32 s16, v5 +; VI-NEXT: v_readfirstlane_b32 s17, v6 ; VI-NEXT: v_readfirstlane_b32 s14, v7 ; VI-NEXT: v_readfirstlane_b32 s15, v8 ; VI-NEXT: v_readfirstlane_b32 s12, v9 @@ -46327,7 +46199,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_readfirstlane_b32 s6, v15 ; VI-NEXT: v_readfirstlane_b32 s7, v16 ; VI-NEXT: v_readfirstlane_b32 s4, v17 -; VI-NEXT: s_and_b64 s[46:47], vcc, exec +; VI-NEXT: s_and_b64 s[26:27], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s5, v18 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill @@ -46346,160 +46218,160 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane ; VI-NEXT: s_cbranch_scc0 .LBB37_3 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s46, s5, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 57 -; VI-NEXT: s_lshr_b32 s46, s5, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 56 -; VI-NEXT: s_lshr_b32 s46, s5, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 55 -; VI-NEXT: s_lshr_b32 s46, s4, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 54 -; VI-NEXT: s_lshr_b32 s46, s4, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 53 -; VI-NEXT: s_lshr_b32 s46, s7, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 52 -; VI-NEXT: s_lshr_b32 s46, s7, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 51 -; VI-NEXT: s_lshr_b32 s46, s7, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 50 -; VI-NEXT: s_lshr_b32 s46, s6, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 49 -; VI-NEXT: s_lshr_b32 s46, s6, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 48 -; VI-NEXT: s_lshr_b32 s46, s9, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 47 -; VI-NEXT: s_lshr_b32 s46, s9, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 46 -; VI-NEXT: s_lshr_b32 s46, s9, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 45 -; VI-NEXT: s_lshr_b32 s46, s8, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 44 -; VI-NEXT: s_lshr_b32 s46, s8, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 43 -; VI-NEXT: s_lshr_b32 s46, s11, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 42 -; VI-NEXT: s_lshr_b32 s46, s11, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 41 -; VI-NEXT: s_lshr_b32 s46, s11, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 40 -; VI-NEXT: s_lshr_b32 s46, s10, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 39 -; VI-NEXT: s_lshr_b32 s46, s10, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 38 -; VI-NEXT: s_lshr_b32 s46, s13, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 37 -; VI-NEXT: s_lshr_b32 s46, s13, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 36 -; VI-NEXT: s_lshr_b32 s46, s13, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 35 -; VI-NEXT: s_lshr_b32 s46, s12, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 34 -; VI-NEXT: s_lshr_b32 s46, s12, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 33 -; VI-NEXT: s_lshr_b32 s46, s15, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 32 -; VI-NEXT: s_lshr_b32 s46, s15, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 31 -; VI-NEXT: s_lshr_b32 s46, s15, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 30 -; VI-NEXT: s_lshr_b32 s46, s14, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 29 -; VI-NEXT: s_lshr_b32 s46, s14, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 28 -; VI-NEXT: s_lshr_b32 s46, s41, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 27 -; VI-NEXT: s_lshr_b32 s46, s41, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 26 -; VI-NEXT: s_lshr_b32 s46, s41, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 25 -; VI-NEXT: s_lshr_b32 s46, s40, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 24 -; VI-NEXT: s_lshr_b32 s46, s40, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 23 -; VI-NEXT: s_lshr_b32 s46, s43, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 22 -; VI-NEXT: s_lshr_b32 s46, s43, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 21 -; VI-NEXT: s_lshr_b32 s46, s43, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 20 -; VI-NEXT: s_lshr_b32 s46, s42, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 19 -; VI-NEXT: s_lshr_b32 s46, s42, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 18 -; VI-NEXT: s_lshr_b32 s46, s45, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 17 -; VI-NEXT: s_lshr_b32 s46, s45, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 16 -; VI-NEXT: s_lshr_b32 s46, s45, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 15 -; VI-NEXT: s_lshr_b32 s46, s44, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 14 -; VI-NEXT: s_lshr_b32 s46, s44, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 13 -; VI-NEXT: s_lshr_b32 s46, s29, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 12 -; VI-NEXT: s_lshr_b32 s46, s29, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 11 -; VI-NEXT: s_lshr_b32 s46, s29, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 10 -; VI-NEXT: s_lshr_b32 s46, s28, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 9 -; VI-NEXT: s_lshr_b32 s46, s28, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 8 -; VI-NEXT: s_lshr_b32 s46, s27, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 7 -; VI-NEXT: s_lshr_b32 s46, s27, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 6 -; VI-NEXT: s_lshr_b32 s46, s27, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 5 -; VI-NEXT: s_lshr_b32 s46, s26, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 4 -; VI-NEXT: s_lshr_b32 s46, s26, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 3 -; VI-NEXT: s_lshr_b32 s46, s25, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 2 -; VI-NEXT: s_lshr_b32 s46, s25, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 1 -; VI-NEXT: s_lshr_b32 s46, s24, 16 -; VI-NEXT: s_lshr_b32 s80, s25, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 0 -; VI-NEXT: s_lshr_b32 s81, s24, 8 -; VI-NEXT: s_lshr_b32 s82, s23, 24 -; VI-NEXT: s_lshr_b32 s83, s23, 16 -; VI-NEXT: s_lshr_b32 s85, s23, 8 -; VI-NEXT: s_lshr_b32 s84, s22, 16 -; VI-NEXT: s_lshr_b32 s86, s22, 8 -; VI-NEXT: s_lshr_b32 s87, s21, 24 -; VI-NEXT: s_lshr_b32 s50, s21, 16 -; VI-NEXT: s_lshr_b32 s52, s21, 8 -; VI-NEXT: s_lshr_b32 s51, s20, 16 -; VI-NEXT: s_lshr_b32 s53, s20, 8 -; VI-NEXT: s_lshr_b32 s54, s19, 24 -; VI-NEXT: s_lshr_b32 s55, s19, 16 -; VI-NEXT: s_lshr_b32 s65, s19, 8 -; VI-NEXT: s_lshr_b32 s64, s18, 16 -; VI-NEXT: s_lshr_b32 s66, s18, 8 -; VI-NEXT: s_lshr_b32 s67, s17, 24 -; VI-NEXT: s_lshr_b32 s68, s17, 16 -; VI-NEXT: s_lshr_b32 s70, s17, 8 -; VI-NEXT: s_lshr_b32 s69, s16, 16 -; VI-NEXT: s_lshr_b32 s71, s16, 8 -; VI-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 -; VI-NEXT: s_lshr_b64 s[56:57], s[6:7], 24 +; VI-NEXT: s_lshr_b32 s26, s5, 24 +; VI-NEXT: v_writelane_b32 v62, s26, 57 +; VI-NEXT: s_lshr_b32 s26, s5, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 56 +; VI-NEXT: s_lshr_b32 s26, s5, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 55 +; VI-NEXT: s_lshr_b32 s26, s4, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 54 +; VI-NEXT: s_lshr_b32 s26, s4, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 53 +; VI-NEXT: s_lshr_b32 s26, s7, 24 +; VI-NEXT: v_writelane_b32 v62, s26, 52 +; VI-NEXT: s_lshr_b32 s26, s7, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 51 +; VI-NEXT: s_lshr_b32 s26, s7, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 50 +; VI-NEXT: s_lshr_b32 s26, s6, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 49 +; VI-NEXT: s_lshr_b32 s26, s6, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 48 +; VI-NEXT: s_lshr_b32 s26, s9, 24 +; VI-NEXT: v_writelane_b32 v62, s26, 47 +; VI-NEXT: s_lshr_b32 s26, s9, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 46 +; VI-NEXT: s_lshr_b32 s26, s9, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 45 +; VI-NEXT: s_lshr_b32 s26, s8, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 44 +; VI-NEXT: s_lshr_b32 s26, s8, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 43 +; VI-NEXT: s_lshr_b32 s26, s11, 24 +; VI-NEXT: v_writelane_b32 v62, s26, 42 +; VI-NEXT: s_lshr_b32 s26, s11, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 41 +; VI-NEXT: s_lshr_b32 s26, s11, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 40 +; VI-NEXT: s_lshr_b32 s26, s10, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 39 +; VI-NEXT: s_lshr_b32 s26, s10, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 38 +; VI-NEXT: s_lshr_b32 s26, s13, 24 +; VI-NEXT: v_writelane_b32 v62, s26, 37 +; VI-NEXT: s_lshr_b32 s26, s13, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 36 +; VI-NEXT: s_lshr_b32 s26, s13, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 35 +; VI-NEXT: s_lshr_b32 s26, s12, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 34 +; VI-NEXT: s_lshr_b32 s26, s12, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 33 +; VI-NEXT: s_lshr_b32 s26, s15, 24 +; VI-NEXT: v_writelane_b32 v62, s26, 32 +; VI-NEXT: s_lshr_b32 s26, s15, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 31 +; VI-NEXT: s_lshr_b32 s26, s15, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 30 +; VI-NEXT: s_lshr_b32 s26, s14, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 29 +; VI-NEXT: s_lshr_b32 s26, s14, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 28 +; VI-NEXT: s_lshr_b32 s26, s17, 24 +; VI-NEXT: v_writelane_b32 v62, s26, 27 +; VI-NEXT: s_lshr_b32 s26, s17, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 26 +; VI-NEXT: s_lshr_b32 s26, s17, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 25 +; VI-NEXT: s_lshr_b32 s26, s16, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 24 +; VI-NEXT: s_lshr_b32 s26, s16, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 23 +; VI-NEXT: s_lshr_b32 s26, s19, 24 +; VI-NEXT: v_writelane_b32 v62, s26, 22 +; VI-NEXT: s_lshr_b32 s26, s19, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 21 +; VI-NEXT: s_lshr_b32 s26, s19, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 20 +; VI-NEXT: s_lshr_b32 s26, s18, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 19 +; VI-NEXT: s_lshr_b32 s26, s18, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 18 +; VI-NEXT: s_lshr_b32 s26, s21, 24 +; VI-NEXT: v_writelane_b32 v62, s26, 17 +; VI-NEXT: s_lshr_b32 s26, s21, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 16 +; VI-NEXT: s_lshr_b32 s26, s21, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 15 +; VI-NEXT: s_lshr_b32 s26, s20, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 14 +; VI-NEXT: s_lshr_b32 s26, s20, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 13 +; VI-NEXT: s_lshr_b32 s26, s23, 24 +; VI-NEXT: v_writelane_b32 v62, s26, 12 +; VI-NEXT: s_lshr_b32 s26, s23, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 11 +; VI-NEXT: s_lshr_b32 s26, s23, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 10 +; VI-NEXT: s_lshr_b32 s26, s22, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 9 +; VI-NEXT: s_lshr_b32 s26, s22, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 8 +; VI-NEXT: s_lshr_b32 s26, s25, 24 +; VI-NEXT: v_writelane_b32 v62, s26, 7 +; VI-NEXT: s_lshr_b32 s26, s25, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 6 +; VI-NEXT: s_lshr_b32 s26, s25, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 5 +; VI-NEXT: s_lshr_b32 s26, s24, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 4 +; VI-NEXT: s_lshr_b32 s26, s24, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 3 +; VI-NEXT: s_lshr_b32 s26, s41, 24 +; VI-NEXT: v_writelane_b32 v62, s26, 2 +; VI-NEXT: s_lshr_b32 s26, s41, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 1 +; VI-NEXT: s_lshr_b32 s26, s40, 16 +; VI-NEXT: s_lshr_b32 s80, s41, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 0 +; VI-NEXT: s_lshr_b32 s81, s40, 8 +; VI-NEXT: s_lshr_b32 s82, s43, 24 +; VI-NEXT: s_lshr_b32 s83, s43, 16 +; VI-NEXT: s_lshr_b32 s85, s43, 8 +; VI-NEXT: s_lshr_b32 s84, s42, 16 +; VI-NEXT: s_lshr_b32 s86, s42, 8 +; VI-NEXT: s_lshr_b32 s87, s45, 24 +; VI-NEXT: s_lshr_b32 s50, s45, 16 +; VI-NEXT: s_lshr_b32 s52, s45, 8 +; VI-NEXT: s_lshr_b32 s51, s44, 16 +; VI-NEXT: s_lshr_b32 s53, s44, 8 +; VI-NEXT: s_lshr_b32 s54, s47, 24 +; VI-NEXT: s_lshr_b32 s55, s47, 16 +; VI-NEXT: s_lshr_b32 s65, s47, 8 +; VI-NEXT: s_lshr_b32 s64, s46, 16 +; VI-NEXT: s_lshr_b32 s66, s46, 8 +; VI-NEXT: s_lshr_b32 s67, s57, 24 +; VI-NEXT: s_lshr_b32 s68, s57, 16 +; VI-NEXT: s_lshr_b32 s70, s57, 8 +; VI-NEXT: s_lshr_b32 s69, s56, 16 +; VI-NEXT: s_lshr_b32 s71, s56, 8 +; VI-NEXT: s_lshr_b64 s[26:27], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[28:29], s[6:7], 24 ; VI-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 ; VI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 ; VI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 ; VI-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 -; VI-NEXT: s_lshr_b64 s[74:75], s[40:41], 24 -; VI-NEXT: s_lshr_b64 s[76:77], s[42:43], 24 -; VI-NEXT: s_lshr_b64 s[78:79], s[44:45], 24 -; VI-NEXT: s_lshr_b64 s[88:89], s[28:29], 24 -; VI-NEXT: s_lshr_b64 s[90:91], s[26:27], 24 -; VI-NEXT: s_lshr_b64 s[30:31], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[34:35], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[36:37], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[38:39], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[48:49], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[74:75], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[76:77], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[88:89], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[90:91], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[30:31], s[40:41], 24 +; VI-NEXT: s_lshr_b64 s[34:35], s[42:43], 24 +; VI-NEXT: s_lshr_b64 s[36:37], s[44:45], 24 +; VI-NEXT: s_lshr_b64 s[38:39], s[46:47], 24 +; VI-NEXT: s_lshr_b64 s[48:49], s[56:57], 24 ; VI-NEXT: s_cbranch_execnz .LBB37_4 ; VI-NEXT: .LBB37_2: ; %cmp.true ; VI-NEXT: v_add_f32_e64 v2, s5, 1.0 @@ -46530,28 +46402,28 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[11:12] -; VI-NEXT: v_add_f32_e64 v14, s41, 1.0 -; VI-NEXT: v_add_f32_e64 v13, s40, 1.0 +; VI-NEXT: v_add_f32_e64 v14, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v13, s16, 1.0 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] -; VI-NEXT: v_add_f32_e64 v16, s43, 1.0 -; VI-NEXT: v_add_f32_e64 v15, s42, 1.0 +; VI-NEXT: v_add_f32_e64 v16, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v15, s18, 1.0 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16] -; VI-NEXT: v_add_f32_e64 v18, s45, 1.0 -; VI-NEXT: v_add_f32_e64 v17, s44, 1.0 +; VI-NEXT: v_add_f32_e64 v18, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v17, s20, 1.0 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18] -; VI-NEXT: v_add_f32_e64 v20, s29, 1.0 -; VI-NEXT: v_add_f32_e64 v19, s28, 1.0 +; VI-NEXT: v_add_f32_e64 v20, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v19, s22, 1.0 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] -; VI-NEXT: v_add_f32_e64 v22, s27, 1.0 -; VI-NEXT: v_add_f32_e64 v21, s26, 1.0 +; VI-NEXT: v_add_f32_e64 v22, s25, 1.0 +; VI-NEXT: v_add_f32_e64 v21, s24, 1.0 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22] @@ -46660,21 +46532,21 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; VI-NEXT: v_add_f32_e64 v28, s21, 1.0 -; VI-NEXT: v_add_f32_e64 v27, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v28, s45, 1.0 +; VI-NEXT: v_add_f32_e64 v27, s44, 1.0 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 -; VI-NEXT: v_add_f32_e64 v30, s19, 1.0 -; VI-NEXT: v_add_f32_e64 v29, s18, 1.0 -; VI-NEXT: v_add_f32_e64 v24, s25, 1.0 -; VI-NEXT: v_add_f32_e64 v23, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v30, s47, 1.0 +; VI-NEXT: v_add_f32_e64 v29, s46, 1.0 +; VI-NEXT: v_add_f32_e64 v24, s41, 1.0 +; VI-NEXT: v_add_f32_e64 v23, s40, 1.0 ; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[27:28] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; VI-NEXT: v_add_f32_e64 v32, s17, 1.0 -; VI-NEXT: v_add_f32_e64 v31, s16, 1.0 -; VI-NEXT: v_add_f32_e64 v26, s23, 1.0 -; VI-NEXT: v_add_f32_e64 v25, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v32, s57, 1.0 +; VI-NEXT: v_add_f32_e64 v31, s56, 1.0 +; VI-NEXT: v_add_f32_e64 v26, s43, 1.0 +; VI-NEXT: v_add_f32_e64 v25, s42, 1.0 ; VI-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24] ; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill @@ -46710,10 +46582,10 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v31 ; VI-NEXT: s_branch .LBB37_5 ; VI-NEXT: .LBB37_3: -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 ; VI-NEXT: ; implicit-def: $sgpr71 ; VI-NEXT: ; implicit-def: $sgpr69 ; VI-NEXT: ; implicit-def: $sgpr70 @@ -46750,126 +46622,126 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: ; implicit-def: $sgpr62 ; VI-NEXT: ; implicit-def: $sgpr60 ; VI-NEXT: ; implicit-def: $sgpr58 -; VI-NEXT: ; implicit-def: $sgpr56 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: s_branch .LBB37_2 ; VI-NEXT: .LBB37_4: -; VI-NEXT: v_mov_b32_e32 v53, s46 +; VI-NEXT: v_mov_b32_e32 v53, s26 ; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s56 +; VI-NEXT: v_mov_b32_e32 v53, s28 ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 0 ; VI-NEXT: v_mov_b32_e32 v48, s4 @@ -47048,26 +46920,26 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_readlane_b32 s4, v62, 57 ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 -; VI-NEXT: v_mov_b32_e32 v31, s16 -; VI-NEXT: v_mov_b32_e32 v32, s17 -; VI-NEXT: v_mov_b32_e32 v29, s18 -; VI-NEXT: v_mov_b32_e32 v30, s19 -; VI-NEXT: v_mov_b32_e32 v27, s20 -; VI-NEXT: v_mov_b32_e32 v28, s21 -; VI-NEXT: v_mov_b32_e32 v25, s22 -; VI-NEXT: v_mov_b32_e32 v26, s23 -; VI-NEXT: v_mov_b32_e32 v23, s24 -; VI-NEXT: v_mov_b32_e32 v24, s25 -; VI-NEXT: v_mov_b32_e32 v21, s26 -; VI-NEXT: v_mov_b32_e32 v22, s27 -; VI-NEXT: v_mov_b32_e32 v19, s28 -; VI-NEXT: v_mov_b32_e32 v20, s29 -; VI-NEXT: v_mov_b32_e32 v17, s44 -; VI-NEXT: v_mov_b32_e32 v18, s45 -; VI-NEXT: v_mov_b32_e32 v15, s42 -; VI-NEXT: v_mov_b32_e32 v16, s43 -; VI-NEXT: v_mov_b32_e32 v13, s40 -; VI-NEXT: v_mov_b32_e32 v14, s41 +; VI-NEXT: v_mov_b32_e32 v31, s56 +; VI-NEXT: v_mov_b32_e32 v32, s57 +; VI-NEXT: v_mov_b32_e32 v29, s46 +; VI-NEXT: v_mov_b32_e32 v30, s47 +; VI-NEXT: v_mov_b32_e32 v27, s44 +; VI-NEXT: v_mov_b32_e32 v28, s45 +; VI-NEXT: v_mov_b32_e32 v25, s42 +; VI-NEXT: v_mov_b32_e32 v26, s43 +; VI-NEXT: v_mov_b32_e32 v23, s40 +; VI-NEXT: v_mov_b32_e32 v24, s41 +; VI-NEXT: v_mov_b32_e32 v21, s24 +; VI-NEXT: v_mov_b32_e32 v22, s25 +; VI-NEXT: v_mov_b32_e32 v19, s22 +; VI-NEXT: v_mov_b32_e32 v20, s23 +; VI-NEXT: v_mov_b32_e32 v17, s20 +; VI-NEXT: v_mov_b32_e32 v18, s21 +; VI-NEXT: v_mov_b32_e32 v15, s18 +; VI-NEXT: v_mov_b32_e32 v16, s19 +; VI-NEXT: v_mov_b32_e32 v13, s16 +; VI-NEXT: v_mov_b32_e32 v14, s17 ; VI-NEXT: v_mov_b32_e32 v11, s14 ; VI-NEXT: v_mov_b32_e32 v12, s15 ; VI-NEXT: v_mov_b32_e32 v9, s12 @@ -47570,27 +47442,55 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: v_writelane_b32 v63, s67, 19 ; GFX9-NEXT: v_writelane_b32 v63, s68, 20 ; GFX9-NEXT: v_writelane_b32 v63, s69, 21 +; GFX9-NEXT: v_mov_b32_e32 v20, s16 ; GFX9-NEXT: v_writelane_b32 v63, s70, 22 +; GFX9-NEXT: v_readfirstlane_b32 s56, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s17 ; GFX9-NEXT: v_writelane_b32 v63, s71, 23 +; GFX9-NEXT: v_readfirstlane_b32 s57, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s18 ; GFX9-NEXT: v_writelane_b32 v63, s80, 24 +; GFX9-NEXT: v_readfirstlane_b32 s46, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s19 ; GFX9-NEXT: v_writelane_b32 v63, s81, 25 +; GFX9-NEXT: v_readfirstlane_b32 s47, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s20 ; GFX9-NEXT: v_writelane_b32 v63, s82, 26 +; GFX9-NEXT: v_readfirstlane_b32 s44, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s21 ; GFX9-NEXT: v_writelane_b32 v63, s83, 27 +; GFX9-NEXT: v_readfirstlane_b32 s45, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s22 ; GFX9-NEXT: v_writelane_b32 v63, s84, 28 +; GFX9-NEXT: v_readfirstlane_b32 s42, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s23 ; GFX9-NEXT: v_writelane_b32 v63, s85, 29 +; GFX9-NEXT: v_readfirstlane_b32 s43, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s24 ; GFX9-NEXT: v_writelane_b32 v63, s86, 30 +; GFX9-NEXT: v_readfirstlane_b32 s40, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s25 ; GFX9-NEXT: v_writelane_b32 v63, s87, 31 +; GFX9-NEXT: v_readfirstlane_b32 s41, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s26 ; GFX9-NEXT: v_writelane_b32 v63, s96, 32 +; GFX9-NEXT: v_readfirstlane_b32 s24, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s27 ; GFX9-NEXT: v_writelane_b32 v63, s97, 33 +; GFX9-NEXT: v_readfirstlane_b32 s25, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s28 ; GFX9-NEXT: v_writelane_b32 v63, s98, 34 +; GFX9-NEXT: v_readfirstlane_b32 s22, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 ; GFX9-NEXT: v_writelane_b32 v63, s99, 35 -; GFX9-NEXT: v_readfirstlane_b32 s44, v1 -; GFX9-NEXT: v_readfirstlane_b32 s45, v2 -; GFX9-NEXT: v_readfirstlane_b32 s42, v3 -; GFX9-NEXT: v_readfirstlane_b32 s43, v4 -; GFX9-NEXT: v_readfirstlane_b32 s40, v5 -; GFX9-NEXT: v_readfirstlane_b32 s41, v6 +; GFX9-NEXT: v_readfirstlane_b32 s23, v20 +; GFX9-NEXT: v_readfirstlane_b32 s20, v1 +; GFX9-NEXT: v_readfirstlane_b32 s21, v2 +; GFX9-NEXT: v_readfirstlane_b32 s18, v3 +; GFX9-NEXT: v_readfirstlane_b32 s19, v4 +; GFX9-NEXT: v_readfirstlane_b32 s16, v5 +; GFX9-NEXT: v_readfirstlane_b32 s17, v6 ; GFX9-NEXT: v_readfirstlane_b32 s14, v7 ; GFX9-NEXT: v_readfirstlane_b32 s15, v8 ; GFX9-NEXT: v_readfirstlane_b32 s12, v9 @@ -47602,7 +47502,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: v_readfirstlane_b32 s6, v15 ; GFX9-NEXT: v_readfirstlane_b32 s7, v16 ; GFX9-NEXT: v_readfirstlane_b32 s4, v17 -; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_and_b64 s[26:27], vcc, exec ; GFX9-NEXT: v_readfirstlane_b32 s5, v18 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill @@ -47621,152 +47521,152 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane ; GFX9-NEXT: s_cbranch_scc0 .LBB37_3 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_lshr_b32 s46, s5, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 49 -; GFX9-NEXT: s_lshr_b32 s46, s5, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 48 -; GFX9-NEXT: s_lshr_b32 s46, s5, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 47 -; GFX9-NEXT: s_lshr_b32 s46, s4, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 46 -; GFX9-NEXT: s_lshr_b32 s46, s4, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 45 -; GFX9-NEXT: s_lshr_b32 s46, s7, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 44 -; GFX9-NEXT: s_lshr_b32 s46, s7, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 43 -; GFX9-NEXT: s_lshr_b32 s46, s7, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 42 -; GFX9-NEXT: s_lshr_b32 s46, s6, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 41 -; GFX9-NEXT: s_lshr_b32 s46, s6, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 40 -; GFX9-NEXT: s_lshr_b32 s46, s9, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 39 -; GFX9-NEXT: s_lshr_b32 s46, s9, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 38 -; GFX9-NEXT: s_lshr_b32 s46, s9, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 37 -; GFX9-NEXT: s_lshr_b32 s46, s8, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 36 -; GFX9-NEXT: s_lshr_b32 s46, s8, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 35 -; GFX9-NEXT: s_lshr_b32 s46, s11, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 34 -; GFX9-NEXT: s_lshr_b32 s46, s11, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 33 -; GFX9-NEXT: s_lshr_b32 s46, s11, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 32 -; GFX9-NEXT: s_lshr_b32 s46, s10, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 31 -; GFX9-NEXT: s_lshr_b32 s46, s10, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 30 -; GFX9-NEXT: s_lshr_b32 s46, s13, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 29 -; GFX9-NEXT: s_lshr_b32 s46, s13, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 28 -; GFX9-NEXT: s_lshr_b32 s46, s13, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 27 -; GFX9-NEXT: s_lshr_b32 s46, s12, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 26 -; GFX9-NEXT: s_lshr_b32 s46, s12, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 25 -; GFX9-NEXT: s_lshr_b32 s46, s15, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 24 -; GFX9-NEXT: s_lshr_b32 s46, s15, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 23 -; GFX9-NEXT: s_lshr_b32 s46, s15, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 22 -; GFX9-NEXT: s_lshr_b32 s46, s14, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 21 -; GFX9-NEXT: s_lshr_b32 s46, s14, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 20 -; GFX9-NEXT: s_lshr_b32 s46, s41, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 19 -; GFX9-NEXT: s_lshr_b32 s46, s41, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 18 -; GFX9-NEXT: s_lshr_b32 s46, s41, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 17 -; GFX9-NEXT: s_lshr_b32 s46, s40, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 16 -; GFX9-NEXT: s_lshr_b32 s46, s40, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 15 -; GFX9-NEXT: s_lshr_b32 s46, s43, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 14 -; GFX9-NEXT: s_lshr_b32 s46, s43, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 13 -; GFX9-NEXT: s_lshr_b32 s46, s43, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 12 -; GFX9-NEXT: s_lshr_b32 s46, s42, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 11 -; GFX9-NEXT: s_lshr_b32 s46, s42, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 10 -; GFX9-NEXT: s_lshr_b32 s46, s45, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 9 -; GFX9-NEXT: s_lshr_b32 s46, s45, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 8 -; GFX9-NEXT: s_lshr_b32 s46, s45, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 7 -; GFX9-NEXT: s_lshr_b32 s46, s44, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 6 -; GFX9-NEXT: s_lshr_b32 s46, s44, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 5 -; GFX9-NEXT: s_lshr_b32 s46, s29, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 4 -; GFX9-NEXT: s_lshr_b32 s46, s29, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 3 -; GFX9-NEXT: s_lshr_b32 s46, s29, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 2 -; GFX9-NEXT: s_lshr_b32 s46, s28, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 1 -; GFX9-NEXT: s_lshr_b32 s46, s28, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 0 -; GFX9-NEXT: s_lshr_b32 s82, s27, 24 -; GFX9-NEXT: s_lshr_b32 s83, s27, 16 -; GFX9-NEXT: s_lshr_b32 s85, s27, 8 -; GFX9-NEXT: s_lshr_b32 s84, s26, 16 -; GFX9-NEXT: s_lshr_b32 s86, s26, 8 -; GFX9-NEXT: s_lshr_b32 s87, s25, 24 -; GFX9-NEXT: s_lshr_b32 s96, s25, 16 -; GFX9-NEXT: s_lshr_b32 s98, s25, 8 -; GFX9-NEXT: s_lshr_b32 s97, s24, 16 -; GFX9-NEXT: s_lshr_b32 s99, s24, 8 -; GFX9-NEXT: s_lshr_b32 s38, s23, 24 -; GFX9-NEXT: s_lshr_b32 s39, s23, 16 -; GFX9-NEXT: s_lshr_b32 s49, s23, 8 -; GFX9-NEXT: s_lshr_b32 s48, s22, 16 -; GFX9-NEXT: s_lshr_b32 s50, s22, 8 -; GFX9-NEXT: s_lshr_b32 s51, s21, 24 -; GFX9-NEXT: s_lshr_b32 s52, s21, 16 -; GFX9-NEXT: s_lshr_b32 s54, s21, 8 -; GFX9-NEXT: s_lshr_b32 s53, s20, 16 -; GFX9-NEXT: s_lshr_b32 s55, s20, 8 -; GFX9-NEXT: s_lshr_b32 s64, s19, 24 -; GFX9-NEXT: s_lshr_b32 s65, s19, 16 -; GFX9-NEXT: s_lshr_b32 s67, s19, 8 -; GFX9-NEXT: s_lshr_b32 s66, s18, 16 -; GFX9-NEXT: s_lshr_b32 s68, s18, 8 -; GFX9-NEXT: s_lshr_b32 s69, s17, 24 -; GFX9-NEXT: s_lshr_b32 s70, s17, 16 -; GFX9-NEXT: s_lshr_b32 s80, s17, 8 -; GFX9-NEXT: s_lshr_b32 s71, s16, 16 -; GFX9-NEXT: s_lshr_b32 s81, s16, 8 -; GFX9-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 -; GFX9-NEXT: s_lshr_b64 s[56:57], s[6:7], 24 +; GFX9-NEXT: s_lshr_b32 s26, s5, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 49 +; GFX9-NEXT: s_lshr_b32 s26, s5, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 48 +; GFX9-NEXT: s_lshr_b32 s26, s5, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 47 +; GFX9-NEXT: s_lshr_b32 s26, s4, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 46 +; GFX9-NEXT: s_lshr_b32 s26, s4, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 45 +; GFX9-NEXT: s_lshr_b32 s26, s7, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 44 +; GFX9-NEXT: s_lshr_b32 s26, s7, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 43 +; GFX9-NEXT: s_lshr_b32 s26, s7, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 42 +; GFX9-NEXT: s_lshr_b32 s26, s6, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 41 +; GFX9-NEXT: s_lshr_b32 s26, s6, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 40 +; GFX9-NEXT: s_lshr_b32 s26, s9, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 39 +; GFX9-NEXT: s_lshr_b32 s26, s9, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 38 +; GFX9-NEXT: s_lshr_b32 s26, s9, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 37 +; GFX9-NEXT: s_lshr_b32 s26, s8, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 36 +; GFX9-NEXT: s_lshr_b32 s26, s8, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 35 +; GFX9-NEXT: s_lshr_b32 s26, s11, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 34 +; GFX9-NEXT: s_lshr_b32 s26, s11, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 33 +; GFX9-NEXT: s_lshr_b32 s26, s11, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 32 +; GFX9-NEXT: s_lshr_b32 s26, s10, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 31 +; GFX9-NEXT: s_lshr_b32 s26, s10, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 30 +; GFX9-NEXT: s_lshr_b32 s26, s13, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 29 +; GFX9-NEXT: s_lshr_b32 s26, s13, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 28 +; GFX9-NEXT: s_lshr_b32 s26, s13, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 27 +; GFX9-NEXT: s_lshr_b32 s26, s12, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 26 +; GFX9-NEXT: s_lshr_b32 s26, s12, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 25 +; GFX9-NEXT: s_lshr_b32 s26, s15, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 24 +; GFX9-NEXT: s_lshr_b32 s26, s15, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 23 +; GFX9-NEXT: s_lshr_b32 s26, s15, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 22 +; GFX9-NEXT: s_lshr_b32 s26, s14, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 21 +; GFX9-NEXT: s_lshr_b32 s26, s14, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 20 +; GFX9-NEXT: s_lshr_b32 s26, s17, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 19 +; GFX9-NEXT: s_lshr_b32 s26, s17, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 18 +; GFX9-NEXT: s_lshr_b32 s26, s17, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 17 +; GFX9-NEXT: s_lshr_b32 s26, s16, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 16 +; GFX9-NEXT: s_lshr_b32 s26, s16, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 15 +; GFX9-NEXT: s_lshr_b32 s26, s19, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 14 +; GFX9-NEXT: s_lshr_b32 s26, s19, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 13 +; GFX9-NEXT: s_lshr_b32 s26, s19, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 12 +; GFX9-NEXT: s_lshr_b32 s26, s18, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 11 +; GFX9-NEXT: s_lshr_b32 s26, s18, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 10 +; GFX9-NEXT: s_lshr_b32 s26, s21, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 9 +; GFX9-NEXT: s_lshr_b32 s26, s21, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 8 +; GFX9-NEXT: s_lshr_b32 s26, s21, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 7 +; GFX9-NEXT: s_lshr_b32 s26, s20, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 6 +; GFX9-NEXT: s_lshr_b32 s26, s20, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 5 +; GFX9-NEXT: s_lshr_b32 s26, s23, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 4 +; GFX9-NEXT: s_lshr_b32 s26, s23, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 3 +; GFX9-NEXT: s_lshr_b32 s26, s23, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 2 +; GFX9-NEXT: s_lshr_b32 s26, s22, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 1 +; GFX9-NEXT: s_lshr_b32 s26, s22, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 0 +; GFX9-NEXT: s_lshr_b32 s82, s25, 24 +; GFX9-NEXT: s_lshr_b32 s83, s25, 16 +; GFX9-NEXT: s_lshr_b32 s85, s25, 8 +; GFX9-NEXT: s_lshr_b32 s84, s24, 16 +; GFX9-NEXT: s_lshr_b32 s86, s24, 8 +; GFX9-NEXT: s_lshr_b32 s87, s41, 24 +; GFX9-NEXT: s_lshr_b32 s96, s41, 16 +; GFX9-NEXT: s_lshr_b32 s98, s41, 8 +; GFX9-NEXT: s_lshr_b32 s97, s40, 16 +; GFX9-NEXT: s_lshr_b32 s99, s40, 8 +; GFX9-NEXT: s_lshr_b32 s38, s43, 24 +; GFX9-NEXT: s_lshr_b32 s39, s43, 16 +; GFX9-NEXT: s_lshr_b32 s49, s43, 8 +; GFX9-NEXT: s_lshr_b32 s48, s42, 16 +; GFX9-NEXT: s_lshr_b32 s50, s42, 8 +; GFX9-NEXT: s_lshr_b32 s51, s45, 24 +; GFX9-NEXT: s_lshr_b32 s52, s45, 16 +; GFX9-NEXT: s_lshr_b32 s54, s45, 8 +; GFX9-NEXT: s_lshr_b32 s53, s44, 16 +; GFX9-NEXT: s_lshr_b32 s55, s44, 8 +; GFX9-NEXT: s_lshr_b32 s64, s47, 24 +; GFX9-NEXT: s_lshr_b32 s65, s47, 16 +; GFX9-NEXT: s_lshr_b32 s67, s47, 8 +; GFX9-NEXT: s_lshr_b32 s66, s46, 16 +; GFX9-NEXT: s_lshr_b32 s68, s46, 8 +; GFX9-NEXT: s_lshr_b32 s69, s57, 24 +; GFX9-NEXT: s_lshr_b32 s70, s57, 16 +; GFX9-NEXT: s_lshr_b32 s80, s57, 8 +; GFX9-NEXT: s_lshr_b32 s71, s56, 16 +; GFX9-NEXT: s_lshr_b32 s81, s56, 8 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[28:29], s[6:7], 24 ; GFX9-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 ; GFX9-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 ; GFX9-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 ; GFX9-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 -; GFX9-NEXT: s_lshr_b64 s[74:75], s[40:41], 24 -; GFX9-NEXT: s_lshr_b64 s[76:77], s[42:43], 24 -; GFX9-NEXT: s_lshr_b64 s[78:79], s[44:45], 24 -; GFX9-NEXT: s_lshr_b64 s[88:89], s[28:29], 24 -; GFX9-NEXT: s_lshr_b64 s[90:91], s[26:27], 24 -; GFX9-NEXT: s_lshr_b64 s[92:93], s[24:25], 24 -; GFX9-NEXT: s_lshr_b64 s[94:95], s[22:23], 24 -; GFX9-NEXT: s_lshr_b64 s[30:31], s[20:21], 24 -; GFX9-NEXT: s_lshr_b64 s[34:35], s[18:19], 24 -; GFX9-NEXT: s_lshr_b64 s[36:37], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[74:75], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[76:77], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[88:89], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[90:91], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[92:93], s[40:41], 24 +; GFX9-NEXT: s_lshr_b64 s[94:95], s[42:43], 24 +; GFX9-NEXT: s_lshr_b64 s[30:31], s[44:45], 24 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[46:47], 24 +; GFX9-NEXT: s_lshr_b64 s[36:37], s[56:57], 24 ; GFX9-NEXT: s_cbranch_execnz .LBB37_4 ; GFX9-NEXT: .LBB37_2: ; %cmp.true ; GFX9-NEXT: v_add_f32_e64 v2, s5, 1.0 @@ -47802,44 +47702,44 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12] -; GFX9-NEXT: v_add_f32_e64 v14, s41, 1.0 -; GFX9-NEXT: v_add_f32_e64 v13, s40, 1.0 +; GFX9-NEXT: v_add_f32_e64 v14, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v13, s16, 1.0 ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[13:14] -; GFX9-NEXT: v_add_f32_e64 v23, s43, 1.0 -; GFX9-NEXT: v_add_f32_e64 v22, s42, 1.0 +; GFX9-NEXT: v_add_f32_e64 v23, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v22, s18, 1.0 ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[22:23] -; GFX9-NEXT: v_add_f32_e64 v25, s45, 1.0 -; GFX9-NEXT: v_add_f32_e64 v24, s44, 1.0 +; GFX9-NEXT: v_add_f32_e64 v25, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v24, s20, 1.0 ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[24:25] -; GFX9-NEXT: v_add_f32_e64 v27, s29, 1.0 -; GFX9-NEXT: v_add_f32_e64 v26, s28, 1.0 +; GFX9-NEXT: v_add_f32_e64 v27, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v26, s22, 1.0 ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[26:27] -; GFX9-NEXT: v_add_f32_e64 v29, s27, 1.0 -; GFX9-NEXT: v_add_f32_e64 v28, s26, 1.0 +; GFX9-NEXT: v_add_f32_e64 v29, s25, 1.0 +; GFX9-NEXT: v_add_f32_e64 v28, s24, 1.0 ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[28:29] -; GFX9-NEXT: v_add_f32_e64 v31, s25, 1.0 -; GFX9-NEXT: v_add_f32_e64 v30, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v31, s41, 1.0 +; GFX9-NEXT: v_add_f32_e64 v30, s40, 1.0 ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[30:31] -; GFX9-NEXT: v_add_f32_e64 v33, s23, 1.0 -; GFX9-NEXT: v_add_f32_e64 v32, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v33, s43, 1.0 +; GFX9-NEXT: v_add_f32_e64 v32, s42, 1.0 ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill @@ -47946,17 +47846,17 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v31 ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v30 -; GFX9-NEXT: v_add_f32_e64 v35, s21, 1.0 -; GFX9-NEXT: v_add_f32_e64 v34, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v35, s45, 1.0 +; GFX9-NEXT: v_add_f32_e64 v34, s44, 1.0 ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v33 -; GFX9-NEXT: v_add_f32_e64 v37, s19, 1.0 -; GFX9-NEXT: v_add_f32_e64 v36, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v37, s47, 1.0 +; GFX9-NEXT: v_add_f32_e64 v36, s46, 1.0 ; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[34:35] ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v33 -; GFX9-NEXT: v_add_f32_e64 v39, s17, 1.0 -; GFX9-NEXT: v_add_f32_e64 v38, s16, 1.0 +; GFX9-NEXT: v_add_f32_e64 v39, s57, 1.0 +; GFX9-NEXT: v_add_f32_e64 v38, s56, 1.0 ; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[36:37] ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v32 @@ -47992,10 +47892,10 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v38 ; GFX9-NEXT: s_branch .LBB37_5 ; GFX9-NEXT: .LBB37_3: -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 ; GFX9-NEXT: ; implicit-def: $sgpr81 ; GFX9-NEXT: ; implicit-def: $sgpr71 ; GFX9-NEXT: ; implicit-def: $sgpr80 @@ -48040,104 +47940,104 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: ; implicit-def: $sgpr62 ; GFX9-NEXT: ; implicit-def: $sgpr60 ; GFX9-NEXT: ; implicit-def: $sgpr58 -; GFX9-NEXT: ; implicit-def: $sgpr56 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr28 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: s_branch .LBB37_2 ; GFX9-NEXT: .LBB37_4: ; GFX9-NEXT: v_mov_b32_e32 v52, s48 @@ -48303,11 +48203,11 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v40, s4 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s46 +; GFX9-NEXT: v_mov_b32_e32 v40, s26 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s56 +; GFX9-NEXT: v_mov_b32_e32 v40, s28 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill @@ -48356,26 +48256,26 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v38, s16 -; GFX9-NEXT: v_mov_b32_e32 v39, s17 -; GFX9-NEXT: v_mov_b32_e32 v36, s18 -; GFX9-NEXT: v_mov_b32_e32 v37, s19 -; GFX9-NEXT: v_mov_b32_e32 v34, s20 -; GFX9-NEXT: v_mov_b32_e32 v35, s21 -; GFX9-NEXT: v_mov_b32_e32 v32, s22 -; GFX9-NEXT: v_mov_b32_e32 v33, s23 -; GFX9-NEXT: v_mov_b32_e32 v30, s24 -; GFX9-NEXT: v_mov_b32_e32 v31, s25 -; GFX9-NEXT: v_mov_b32_e32 v28, s26 -; GFX9-NEXT: v_mov_b32_e32 v29, s27 -; GFX9-NEXT: v_mov_b32_e32 v26, s28 -; GFX9-NEXT: v_mov_b32_e32 v27, s29 -; GFX9-NEXT: v_mov_b32_e32 v24, s44 -; GFX9-NEXT: v_mov_b32_e32 v25, s45 -; GFX9-NEXT: v_mov_b32_e32 v22, s42 -; GFX9-NEXT: v_mov_b32_e32 v23, s43 -; GFX9-NEXT: v_mov_b32_e32 v13, s40 -; GFX9-NEXT: v_mov_b32_e32 v14, s41 +; GFX9-NEXT: v_mov_b32_e32 v38, s56 +; GFX9-NEXT: v_mov_b32_e32 v39, s57 +; GFX9-NEXT: v_mov_b32_e32 v36, s46 +; GFX9-NEXT: v_mov_b32_e32 v37, s47 +; GFX9-NEXT: v_mov_b32_e32 v34, s44 +; GFX9-NEXT: v_mov_b32_e32 v35, s45 +; GFX9-NEXT: v_mov_b32_e32 v32, s42 +; GFX9-NEXT: v_mov_b32_e32 v33, s43 +; GFX9-NEXT: v_mov_b32_e32 v30, s40 +; GFX9-NEXT: v_mov_b32_e32 v31, s41 +; GFX9-NEXT: v_mov_b32_e32 v28, s24 +; GFX9-NEXT: v_mov_b32_e32 v29, s25 +; GFX9-NEXT: v_mov_b32_e32 v26, s22 +; GFX9-NEXT: v_mov_b32_e32 v27, s23 +; GFX9-NEXT: v_mov_b32_e32 v24, s20 +; GFX9-NEXT: v_mov_b32_e32 v25, s21 +; GFX9-NEXT: v_mov_b32_e32 v22, s18 +; GFX9-NEXT: v_mov_b32_e32 v23, s19 +; GFX9-NEXT: v_mov_b32_e32 v13, s16 +; GFX9-NEXT: v_mov_b32_e32 v14, s17 ; GFX9-NEXT: v_mov_b32_e32 v11, s14 ; GFX9-NEXT: v_mov_b32_e32 v12, s15 ; GFX9-NEXT: v_mov_b32_e32 v9, s12 @@ -48833,32 +48733,70 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11-NEXT: s_mov_b32 exec_lo, s4 ; GFX11-NEXT: v_writelane_b32 v75, s30, 0 ; GFX11-NEXT: v_writelane_b32 v76, s96, 0 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 -; GFX11-NEXT: v_readfirstlane_b32 s40, v1 -; GFX11-NEXT: v_readfirstlane_b32 s41, v2 +; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1 +; GFX11-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3 ; GFX11-NEXT: v_writelane_b32 v75, s31, 1 ; GFX11-NEXT: v_writelane_b32 v76, s97, 1 -; GFX11-NEXT: v_readfirstlane_b32 s14, v3 -; GFX11-NEXT: v_readfirstlane_b32 s15, v4 -; GFX11-NEXT: v_readfirstlane_b32 s4, v5 +; GFX11-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v21, s17 +; GFX11-NEXT: v_dual_mov_b32 v22, s18 :: v_dual_mov_b32 v23, s19 ; GFX11-NEXT: v_writelane_b32 v75, s34, 2 ; GFX11-NEXT: v_writelane_b32 v76, s98, 2 -; GFX11-NEXT: v_readfirstlane_b32 s5, v6 -; GFX11-NEXT: v_readfirstlane_b32 s6, v7 -; GFX11-NEXT: v_readfirstlane_b32 s7, v8 +; GFX11-NEXT: v_dual_mov_b32 v24, s20 :: v_dual_mov_b32 v25, s21 +; GFX11-NEXT: v_dual_mov_b32 v26, s22 :: v_dual_mov_b32 v27, s23 ; GFX11-NEXT: v_writelane_b32 v75, s35, 3 ; GFX11-NEXT: v_writelane_b32 v76, s99, 3 -; GFX11-NEXT: v_readfirstlane_b32 s8, v9 -; GFX11-NEXT: v_readfirstlane_b32 s9, v10 -; GFX11-NEXT: v_readfirstlane_b32 s10, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, s24 :: v_dual_mov_b32 v29, s25 +; GFX11-NEXT: v_dual_mov_b32 v30, s26 :: v_dual_mov_b32 v31, s27 ; GFX11-NEXT: v_writelane_b32 v75, s36, 4 ; GFX11-NEXT: v_writelane_b32 v76, s100, 4 -; GFX11-NEXT: v_readfirstlane_b32 s11, v12 -; GFX11-NEXT: v_readfirstlane_b32 s12, v13 -; GFX11-NEXT: v_readfirstlane_b32 s13, v14 +; GFX11-NEXT: v_dual_mov_b32 v32, s28 :: v_dual_mov_b32 v33, s29 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 ; GFX11-NEXT: v_writelane_b32 v75, s37, 5 ; GFX11-NEXT: v_writelane_b32 v76, s101, 5 +; GFX11-NEXT: v_readfirstlane_b32 s40, v16 +; GFX11-NEXT: v_readfirstlane_b32 s41, v17 +; GFX11-NEXT: v_readfirstlane_b32 s28, v18 +; GFX11-NEXT: v_writelane_b32 v75, s38, 6 +; GFX11-NEXT: v_writelane_b32 v76, s102, 6 +; GFX11-NEXT: v_readfirstlane_b32 s29, v19 +; GFX11-NEXT: v_readfirstlane_b32 s26, v20 +; GFX11-NEXT: v_readfirstlane_b32 s27, v21 +; GFX11-NEXT: v_writelane_b32 v75, s39, 7 +; GFX11-NEXT: v_writelane_b32 v76, s103, 7 +; GFX11-NEXT: v_readfirstlane_b32 s24, v22 +; GFX11-NEXT: v_readfirstlane_b32 s25, v23 +; GFX11-NEXT: v_readfirstlane_b32 s22, v24 +; GFX11-NEXT: v_writelane_b32 v75, s48, 8 +; GFX11-NEXT: v_readfirstlane_b32 s23, v25 +; GFX11-NEXT: v_readfirstlane_b32 s20, v26 +; GFX11-NEXT: v_readfirstlane_b32 s21, v27 +; GFX11-NEXT: v_readfirstlane_b32 s18, v28 +; GFX11-NEXT: v_writelane_b32 v75, s49, 9 +; GFX11-NEXT: v_readfirstlane_b32 s19, v29 +; GFX11-NEXT: v_readfirstlane_b32 s16, v30 +; GFX11-NEXT: v_readfirstlane_b32 s17, v31 +; GFX11-NEXT: v_readfirstlane_b32 s14, v32 +; GFX11-NEXT: v_writelane_b32 v75, s50, 10 +; GFX11-NEXT: v_readfirstlane_b32 s15, v33 +; GFX11-NEXT: v_readfirstlane_b32 s12, v1 +; GFX11-NEXT: v_readfirstlane_b32 s13, v2 +; GFX11-NEXT: v_readfirstlane_b32 s10, v3 +; GFX11-NEXT: v_writelane_b32 v75, s51, 11 +; GFX11-NEXT: v_readfirstlane_b32 s11, v4 +; GFX11-NEXT: v_readfirstlane_b32 s0, v5 +; GFX11-NEXT: v_readfirstlane_b32 s1, v6 +; GFX11-NEXT: v_readfirstlane_b32 s2, v7 +; GFX11-NEXT: v_writelane_b32 v75, s52, 12 +; GFX11-NEXT: v_readfirstlane_b32 s3, v8 +; GFX11-NEXT: v_readfirstlane_b32 s4, v9 +; GFX11-NEXT: v_readfirstlane_b32 s5, v10 +; GFX11-NEXT: v_readfirstlane_b32 s6, v11 +; GFX11-NEXT: v_writelane_b32 v75, s53, 13 +; GFX11-NEXT: v_readfirstlane_b32 s7, v12 +; GFX11-NEXT: v_readfirstlane_b32 s8, v13 +; GFX11-NEXT: v_readfirstlane_b32 s9, v14 ; GFX11-NEXT: s_mov_b32 vcc_hi, 0 +; GFX11-NEXT: v_writelane_b32 v75, s54, 14 ; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo ; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:72 @@ -48880,20 +48818,9 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:8 ; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:4 ; GFX11-NEXT: scratch_store_b32 off, v74, s32 -; GFX11-NEXT: v_writelane_b32 v75, s38, 6 -; GFX11-NEXT: v_writelane_b32 v76, s102, 6 +; GFX11-NEXT: v_writelane_b32 v76, s104, 8 ; GFX11-NEXT: ; implicit-def: $vgpr77 : SGPR spill to VGPR lane ; GFX11-NEXT: ; implicit-def: $vgpr78 : SGPR spill to VGPR lane -; GFX11-NEXT: v_writelane_b32 v75, s39, 7 -; GFX11-NEXT: v_writelane_b32 v76, s103, 7 -; GFX11-NEXT: v_writelane_b32 v75, s48, 8 -; GFX11-NEXT: v_writelane_b32 v76, s104, 8 -; GFX11-NEXT: v_writelane_b32 v75, s49, 9 -; GFX11-NEXT: v_writelane_b32 v75, s50, 10 -; GFX11-NEXT: v_writelane_b32 v75, s51, 11 -; GFX11-NEXT: v_writelane_b32 v75, s52, 12 -; GFX11-NEXT: v_writelane_b32 v75, s53, 13 -; GFX11-NEXT: v_writelane_b32 v75, s54, 14 ; GFX11-NEXT: v_writelane_b32 v75, s55, 15 ; GFX11-NEXT: v_writelane_b32 v75, s64, 16 ; GFX11-NEXT: v_writelane_b32 v75, s65, 17 @@ -48913,190 +48840,190 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11-NEXT: v_writelane_b32 v75, s87, 31 ; GFX11-NEXT: s_cbranch_scc0 .LBB37_3 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: s_lshr_b32 s42, s13, 24 -; GFX11-NEXT: s_lshr_b32 s36, s27, 16 +; GFX11-NEXT: s_lshr_b32 s42, s9, 24 +; GFX11-NEXT: s_lshr_b32 s36, s17, 16 ; GFX11-NEXT: v_writelane_b32 v78, s42, 8 -; GFX11-NEXT: s_lshr_b32 s42, s13, 16 -; GFX11-NEXT: s_lshr_b32 s38, s27, 8 -; GFX11-NEXT: s_lshr_b32 s37, s26, 16 -; GFX11-NEXT: s_lshr_b32 s39, s26, 8 +; GFX11-NEXT: s_lshr_b32 s42, s9, 16 +; GFX11-NEXT: s_lshr_b32 s38, s17, 8 +; GFX11-NEXT: s_lshr_b32 s37, s16, 16 +; GFX11-NEXT: s_lshr_b32 s39, s16, 8 ; GFX11-NEXT: v_writelane_b32 v78, s42, 7 -; GFX11-NEXT: s_lshr_b32 s42, s13, 8 -; GFX11-NEXT: s_lshr_b32 s48, s25, 24 -; GFX11-NEXT: s_lshr_b32 s49, s25, 16 -; GFX11-NEXT: s_lshr_b32 s51, s25, 8 +; GFX11-NEXT: s_lshr_b32 s42, s9, 8 +; GFX11-NEXT: s_lshr_b32 s48, s19, 24 +; GFX11-NEXT: s_lshr_b32 s49, s19, 16 +; GFX11-NEXT: s_lshr_b32 s51, s19, 8 ; GFX11-NEXT: v_writelane_b32 v78, s42, 6 -; GFX11-NEXT: s_lshr_b32 s42, s12, 16 -; GFX11-NEXT: s_lshr_b32 s50, s24, 16 -; GFX11-NEXT: s_lshr_b32 s52, s24, 8 -; GFX11-NEXT: s_lshr_b32 s53, s23, 24 +; GFX11-NEXT: s_lshr_b32 s42, s8, 16 +; GFX11-NEXT: s_lshr_b32 s50, s18, 16 +; GFX11-NEXT: s_lshr_b32 s52, s18, 8 +; GFX11-NEXT: s_lshr_b32 s53, s21, 24 ; GFX11-NEXT: v_writelane_b32 v78, s42, 5 -; GFX11-NEXT: s_lshr_b32 s42, s12, 8 -; GFX11-NEXT: s_lshr_b32 s54, s23, 16 -; GFX11-NEXT: s_lshr_b32 s64, s23, 8 -; GFX11-NEXT: s_lshr_b32 s55, s22, 16 +; GFX11-NEXT: s_lshr_b32 s42, s8, 8 +; GFX11-NEXT: s_lshr_b32 s54, s21, 16 +; GFX11-NEXT: s_lshr_b32 s64, s21, 8 +; GFX11-NEXT: s_lshr_b32 s55, s20, 16 ; GFX11-NEXT: v_writelane_b32 v78, s42, 4 -; GFX11-NEXT: s_lshr_b32 s42, s11, 24 -; GFX11-NEXT: s_lshr_b32 s65, s22, 8 -; GFX11-NEXT: s_lshr_b32 s66, s21, 24 -; GFX11-NEXT: s_lshr_b32 s67, s21, 16 +; GFX11-NEXT: s_lshr_b32 s42, s7, 24 +; GFX11-NEXT: s_lshr_b32 s65, s20, 8 +; GFX11-NEXT: s_lshr_b32 s66, s23, 24 +; GFX11-NEXT: s_lshr_b32 s67, s23, 16 ; GFX11-NEXT: v_writelane_b32 v78, s42, 3 -; GFX11-NEXT: s_lshr_b32 s42, s11, 16 -; GFX11-NEXT: s_lshr_b32 s69, s21, 8 -; GFX11-NEXT: s_lshr_b32 s68, s20, 16 -; GFX11-NEXT: s_lshr_b32 s70, s20, 8 +; GFX11-NEXT: s_lshr_b32 s42, s7, 16 +; GFX11-NEXT: s_lshr_b32 s69, s23, 8 +; GFX11-NEXT: s_lshr_b32 s68, s22, 16 +; GFX11-NEXT: s_lshr_b32 s70, s22, 8 ; GFX11-NEXT: v_writelane_b32 v78, s42, 2 -; GFX11-NEXT: s_lshr_b32 s42, s11, 8 -; GFX11-NEXT: s_lshr_b32 s71, s19, 24 -; GFX11-NEXT: s_lshr_b32 s80, s19, 16 -; GFX11-NEXT: s_lshr_b32 s82, s19, 8 +; GFX11-NEXT: s_lshr_b32 s42, s7, 8 +; GFX11-NEXT: s_lshr_b32 s71, s25, 24 +; GFX11-NEXT: s_lshr_b32 s80, s25, 16 +; GFX11-NEXT: s_lshr_b32 s82, s25, 8 ; GFX11-NEXT: v_writelane_b32 v78, s42, 1 -; GFX11-NEXT: s_lshr_b32 s42, s10, 16 -; GFX11-NEXT: s_lshr_b32 s81, s18, 16 -; GFX11-NEXT: s_lshr_b32 s83, s18, 8 -; GFX11-NEXT: s_lshr_b32 s84, s17, 24 +; GFX11-NEXT: s_lshr_b32 s42, s6, 16 +; GFX11-NEXT: s_lshr_b32 s81, s24, 16 +; GFX11-NEXT: s_lshr_b32 s83, s24, 8 +; GFX11-NEXT: s_lshr_b32 s84, s27, 24 ; GFX11-NEXT: v_writelane_b32 v78, s42, 0 -; GFX11-NEXT: s_lshr_b32 s42, s10, 8 -; GFX11-NEXT: s_lshr_b32 s85, s17, 16 +; GFX11-NEXT: s_lshr_b32 s42, s6, 8 +; GFX11-NEXT: s_lshr_b32 s85, s27, 16 ; GFX11-NEXT: v_writelane_b32 v77, s42, 31 -; GFX11-NEXT: s_lshr_b32 s42, s9, 24 -; GFX11-NEXT: s_lshr_b32 s87, s17, 8 -; GFX11-NEXT: s_lshr_b32 s86, s16, 16 -; GFX11-NEXT: s_lshr_b32 s96, s16, 8 +; GFX11-NEXT: s_lshr_b32 s42, s5, 24 +; GFX11-NEXT: s_lshr_b32 s87, s27, 8 +; GFX11-NEXT: s_lshr_b32 s86, s26, 16 +; GFX11-NEXT: s_lshr_b32 s96, s26, 8 ; GFX11-NEXT: v_writelane_b32 v77, s42, 30 -; GFX11-NEXT: s_lshr_b32 s42, s9, 16 -; GFX11-NEXT: s_lshr_b32 s97, s3, 24 -; GFX11-NEXT: s_lshr_b32 s98, s3, 16 -; GFX11-NEXT: s_lshr_b32 s100, s3, 8 +; GFX11-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-NEXT: s_lshr_b32 s97, s29, 24 +; GFX11-NEXT: s_lshr_b32 s98, s29, 16 +; GFX11-NEXT: s_lshr_b32 s100, s29, 8 ; GFX11-NEXT: v_writelane_b32 v77, s42, 29 -; GFX11-NEXT: s_lshr_b32 s42, s9, 8 -; GFX11-NEXT: s_lshr_b32 s99, s2, 16 -; GFX11-NEXT: s_lshr_b32 s101, s2, 8 -; GFX11-NEXT: s_lshr_b32 s102, s1, 24 +; GFX11-NEXT: s_lshr_b32 s42, s5, 8 +; GFX11-NEXT: s_lshr_b32 s99, s28, 16 +; GFX11-NEXT: s_lshr_b32 s101, s28, 8 +; GFX11-NEXT: s_lshr_b32 s102, s41, 24 ; GFX11-NEXT: v_writelane_b32 v77, s42, 28 -; GFX11-NEXT: s_lshr_b32 s42, s8, 16 -; GFX11-NEXT: s_lshr_b32 s103, s1, 16 -; GFX11-NEXT: s_lshr_b32 s34, s1, 8 -; GFX11-NEXT: s_lshr_b32 s104, s0, 16 +; GFX11-NEXT: s_lshr_b32 s42, s4, 16 +; GFX11-NEXT: s_lshr_b32 s103, s41, 16 +; GFX11-NEXT: s_lshr_b32 s34, s41, 8 +; GFX11-NEXT: s_lshr_b32 s104, s40, 16 ; GFX11-NEXT: v_writelane_b32 v77, s42, 27 -; GFX11-NEXT: s_lshr_b32 s42, s8, 8 -; GFX11-NEXT: s_lshr_b32 s35, s0, 8 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 -; GFX11-NEXT: s_lshr_b64 s[72:73], s[10:11], 24 +; GFX11-NEXT: s_lshr_b32 s42, s4, 8 +; GFX11-NEXT: s_lshr_b32 s35, s40, 8 +; GFX11-NEXT: s_lshr_b64 s[62:63], s[8:9], 24 +; GFX11-NEXT: s_lshr_b64 s[72:73], s[6:7], 24 ; GFX11-NEXT: v_writelane_b32 v77, s42, 26 -; GFX11-NEXT: s_lshr_b32 s42, s7, 24 -; GFX11-NEXT: s_lshr_b64 s[74:75], s[8:9], 24 -; GFX11-NEXT: s_lshr_b64 s[76:77], s[6:7], 24 -; GFX11-NEXT: s_lshr_b64 s[78:79], s[4:5], 24 +; GFX11-NEXT: s_lshr_b32 s42, s3, 24 +; GFX11-NEXT: s_lshr_b64 s[74:75], s[4:5], 24 +; GFX11-NEXT: s_lshr_b64 s[76:77], s[2:3], 24 +; GFX11-NEXT: s_lshr_b64 s[78:79], s[0:1], 24 ; GFX11-NEXT: v_writelane_b32 v77, s42, 25 -; GFX11-NEXT: s_lshr_b32 s42, s7, 16 -; GFX11-NEXT: s_lshr_b64 s[88:89], s[14:15], 24 -; GFX11-NEXT: s_lshr_b64 s[90:91], s[40:41], 24 -; GFX11-NEXT: s_lshr_b64 s[92:93], s[28:29], 24 +; GFX11-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-NEXT: s_lshr_b64 s[88:89], s[10:11], 24 +; GFX11-NEXT: s_lshr_b64 s[90:91], s[12:13], 24 +; GFX11-NEXT: s_lshr_b64 s[92:93], s[14:15], 24 ; GFX11-NEXT: v_writelane_b32 v77, s42, 24 -; GFX11-NEXT: s_lshr_b32 s42, s7, 8 -; GFX11-NEXT: s_lshr_b64 s[94:95], s[26:27], 24 -; GFX11-NEXT: s_lshr_b64 s[30:31], s[24:25], 24 -; GFX11-NEXT: s_lshr_b64 s[60:61], s[22:23], 24 +; GFX11-NEXT: s_lshr_b32 s42, s3, 8 +; GFX11-NEXT: s_lshr_b64 s[94:95], s[16:17], 24 +; GFX11-NEXT: s_lshr_b64 s[30:31], s[18:19], 24 +; GFX11-NEXT: s_lshr_b64 s[60:61], s[20:21], 24 ; GFX11-NEXT: v_writelane_b32 v77, s42, 23 -; GFX11-NEXT: s_lshr_b32 s42, s6, 16 -; GFX11-NEXT: s_lshr_b64 s[58:59], s[20:21], 24 -; GFX11-NEXT: s_lshr_b64 s[56:57], s[18:19], 24 -; GFX11-NEXT: s_lshr_b64 s[46:47], s[16:17], 24 +; GFX11-NEXT: s_lshr_b32 s42, s2, 16 +; GFX11-NEXT: s_lshr_b64 s[58:59], s[22:23], 24 +; GFX11-NEXT: s_lshr_b64 s[56:57], s[24:25], 24 +; GFX11-NEXT: s_lshr_b64 s[46:47], s[26:27], 24 ; GFX11-NEXT: v_writelane_b32 v77, s42, 22 -; GFX11-NEXT: s_lshr_b32 s42, s6, 8 -; GFX11-NEXT: s_lshr_b64 s[44:45], s[2:3], 24 +; GFX11-NEXT: s_lshr_b32 s42, s2, 8 +; GFX11-NEXT: s_lshr_b64 s[44:45], s[28:29], 24 ; GFX11-NEXT: v_writelane_b32 v77, s42, 21 -; GFX11-NEXT: s_lshr_b32 s42, s5, 24 +; GFX11-NEXT: s_lshr_b32 s42, s1, 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_writelane_b32 v77, s42, 20 -; GFX11-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-NEXT: s_lshr_b32 s42, s1, 16 ; GFX11-NEXT: v_writelane_b32 v77, s42, 19 -; GFX11-NEXT: s_lshr_b32 s42, s5, 8 +; GFX11-NEXT: s_lshr_b32 s42, s1, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_writelane_b32 v77, s42, 18 -; GFX11-NEXT: s_lshr_b32 s42, s4, 16 +; GFX11-NEXT: s_lshr_b32 s42, s0, 16 ; GFX11-NEXT: v_writelane_b32 v77, s42, 17 -; GFX11-NEXT: s_lshr_b32 s42, s4, 8 +; GFX11-NEXT: s_lshr_b32 s42, s0, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_writelane_b32 v77, s42, 16 -; GFX11-NEXT: s_lshr_b32 s42, s15, 24 +; GFX11-NEXT: s_lshr_b32 s42, s11, 24 ; GFX11-NEXT: v_writelane_b32 v77, s42, 15 -; GFX11-NEXT: s_lshr_b32 s42, s15, 16 +; GFX11-NEXT: s_lshr_b32 s42, s11, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_writelane_b32 v77, s42, 14 -; GFX11-NEXT: s_lshr_b32 s42, s15, 8 +; GFX11-NEXT: s_lshr_b32 s42, s11, 8 ; GFX11-NEXT: v_writelane_b32 v77, s42, 13 -; GFX11-NEXT: s_lshr_b32 s42, s14, 16 +; GFX11-NEXT: s_lshr_b32 s42, s10, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_writelane_b32 v77, s42, 12 -; GFX11-NEXT: s_lshr_b32 s42, s14, 8 +; GFX11-NEXT: s_lshr_b32 s42, s10, 8 ; GFX11-NEXT: v_writelane_b32 v77, s42, 11 -; GFX11-NEXT: s_lshr_b32 s42, s41, 24 +; GFX11-NEXT: s_lshr_b32 s42, s13, 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_writelane_b32 v77, s42, 10 -; GFX11-NEXT: s_lshr_b32 s42, s41, 16 +; GFX11-NEXT: s_lshr_b32 s42, s13, 16 ; GFX11-NEXT: v_writelane_b32 v77, s42, 9 -; GFX11-NEXT: s_lshr_b32 s42, s41, 8 +; GFX11-NEXT: s_lshr_b32 s42, s13, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_writelane_b32 v77, s42, 8 -; GFX11-NEXT: s_lshr_b32 s42, s40, 16 +; GFX11-NEXT: s_lshr_b32 s42, s12, 16 ; GFX11-NEXT: v_writelane_b32 v77, s42, 7 -; GFX11-NEXT: s_lshr_b32 s42, s40, 8 +; GFX11-NEXT: s_lshr_b32 s42, s12, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_writelane_b32 v77, s42, 6 -; GFX11-NEXT: s_lshr_b32 s42, s29, 24 +; GFX11-NEXT: s_lshr_b32 s42, s15, 24 ; GFX11-NEXT: v_writelane_b32 v77, s42, 5 -; GFX11-NEXT: s_lshr_b32 s42, s29, 16 +; GFX11-NEXT: s_lshr_b32 s42, s15, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_writelane_b32 v77, s42, 4 -; GFX11-NEXT: s_lshr_b32 s42, s29, 8 +; GFX11-NEXT: s_lshr_b32 s42, s15, 8 ; GFX11-NEXT: v_writelane_b32 v77, s42, 3 -; GFX11-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-NEXT: s_lshr_b32 s42, s14, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_writelane_b32 v77, s42, 2 -; GFX11-NEXT: s_lshr_b32 s42, s28, 8 +; GFX11-NEXT: s_lshr_b32 s42, s14, 8 ; GFX11-NEXT: v_writelane_b32 v77, s42, 1 -; GFX11-NEXT: s_lshr_b32 s42, s27, 24 +; GFX11-NEXT: s_lshr_b32 s42, s17, 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_writelane_b32 v77, s42, 0 -; GFX11-NEXT: s_lshr_b64 s[42:43], s[0:1], 24 +; GFX11-NEXT: s_lshr_b64 s[42:43], s[40:41], 24 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_hi ; GFX11-NEXT: s_cbranch_vccnz .LBB37_4 ; GFX11-NEXT: .LBB37_2: ; %cmp.true -; GFX11-NEXT: v_add_f32_e64 v22, s27, 1.0 -; GFX11-NEXT: v_add_f32_e64 v21, s26, 1.0 -; GFX11-NEXT: v_add_f32_e64 v24, s25, 1.0 -; GFX11-NEXT: v_add_f32_e64 v23, s24, 1.0 -; GFX11-NEXT: v_add_f32_e64 v29, s23, 1.0 -; GFX11-NEXT: v_add_f32_e64 v28, s22, 1.0 -; GFX11-NEXT: v_add_f32_e64 v31, s21, 1.0 -; GFX11-NEXT: v_add_f32_e64 v30, s20, 1.0 -; GFX11-NEXT: v_add_f32_e64 v35, s19, 1.0 -; GFX11-NEXT: v_add_f32_e64 v34, s18, 1.0 -; GFX11-NEXT: v_add_f32_e64 v37, s17, 1.0 -; GFX11-NEXT: v_add_f32_e64 v36, s16, 1.0 -; GFX11-NEXT: v_add_f32_e64 v6, s9, 1.0 -; GFX11-NEXT: v_add_f32_e64 v5, s8, 1.0 +; GFX11-NEXT: v_add_f32_e64 v22, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v21, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v24, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v23, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v29, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v28, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v31, s23, 1.0 +; GFX11-NEXT: v_add_f32_e64 v30, s22, 1.0 +; GFX11-NEXT: v_add_f32_e64 v35, s25, 1.0 +; GFX11-NEXT: v_add_f32_e64 v34, s24, 1.0 +; GFX11-NEXT: v_add_f32_e64 v37, s27, 1.0 +; GFX11-NEXT: v_add_f32_e64 v36, s26, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s5, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s4, 1.0 ; GFX11-NEXT: v_lshrrev_b64 v[64:65], 24, v[21:22] -; GFX11-NEXT: v_add_f32_e64 v53, s1, 1.0 -; GFX11-NEXT: v_add_f32_e64 v52, s0, 1.0 -; GFX11-NEXT: v_add_f32_e64 v49, s3, 1.0 -; GFX11-NEXT: v_add_f32_e64 v48, s2, 1.0 -; GFX11-NEXT: v_add_f32_e64 v18, s29, 1.0 -; GFX11-NEXT: v_add_f32_e64 v17, s28, 1.0 -; GFX11-NEXT: v_add_f32_e64 v14, s41, 1.0 -; GFX11-NEXT: v_add_f32_e64 v13, s40, 1.0 -; GFX11-NEXT: v_add_f32_e64 v12, s15, 1.0 -; GFX11-NEXT: v_add_f32_e64 v11, s14, 1.0 -; GFX11-NEXT: v_add_f32_e64 v10, s5, 1.0 -; GFX11-NEXT: v_add_f32_e64 v8, s7, 1.0 -; GFX11-NEXT: v_add_f32_e64 v4, s11, 1.0 -; GFX11-NEXT: v_add_f32_e64 v2, s13, 1.0 -; GFX11-NEXT: v_add_f32_e64 v1, s12, 1.0 -; GFX11-NEXT: v_add_f32_e64 v3, s10, 1.0 -; GFX11-NEXT: v_add_f32_e64 v7, s6, 1.0 -; GFX11-NEXT: v_add_f32_e64 v9, s4, 1.0 +; GFX11-NEXT: v_add_f32_e64 v53, s41, 1.0 +; GFX11-NEXT: v_add_f32_e64 v52, s40, 1.0 +; GFX11-NEXT: v_add_f32_e64 v49, s29, 1.0 +; GFX11-NEXT: v_add_f32_e64 v48, s28, 1.0 +; GFX11-NEXT: v_add_f32_e64 v18, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v17, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v14, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v13, s12, 1.0 +; GFX11-NEXT: v_add_f32_e64 v12, s11, 1.0 +; GFX11-NEXT: v_add_f32_e64 v11, s10, 1.0 +; GFX11-NEXT: v_add_f32_e64 v10, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s7, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s9, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s8, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s6, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s0, 1.0 ; GFX11-NEXT: v_lshrrev_b64 v[65:66], 24, v[23:24] ; GFX11-NEXT: v_lshrrev_b64 v[66:67], 24, v[28:29] ; GFX11-NEXT: v_lshrrev_b64 v[67:68], 24, v[30:31] @@ -49333,41 +49260,41 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11-NEXT: ; kill: killed $sgpr43 ; GFX11-NEXT: s_branch .LBB37_2 ; GFX11-NEXT: .LBB37_4: -; GFX11-NEXT: v_dual_mov_b32 v52, s0 :: v_dual_mov_b32 v53, s1 +; GFX11-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 ; GFX11-NEXT: v_readlane_b32 s0, v77, 0 -; GFX11-NEXT: v_dual_mov_b32 v147, s36 :: v_dual_mov_b32 v48, s2 -; GFX11-NEXT: v_dual_mov_b32 v49, s3 :: v_dual_mov_b32 v36, s16 +; GFX11-NEXT: v_dual_mov_b32 v147, s36 :: v_dual_mov_b32 v52, s40 +; GFX11-NEXT: v_dual_mov_b32 v53, s41 :: v_dual_mov_b32 v48, s28 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v37, s17 :: v_dual_mov_b32 v148, s0 +; GFX11-NEXT: v_dual_mov_b32 v49, s29 :: v_dual_mov_b32 v148, s0 ; GFX11-NEXT: v_readlane_b32 s0, v77, 1 -; GFX11-NEXT: v_dual_mov_b32 v34, s18 :: v_dual_mov_b32 v35, s19 -; GFX11-NEXT: v_dual_mov_b32 v30, s20 :: v_dual_mov_b32 v31, s21 +; GFX11-NEXT: v_dual_mov_b32 v36, s26 :: v_dual_mov_b32 v37, s27 +; GFX11-NEXT: v_dual_mov_b32 v34, s24 :: v_dual_mov_b32 v35, s25 ; GFX11-NEXT: v_mov_b32_e32 v146, s0 ; GFX11-NEXT: v_readlane_b32 s0, v77, 2 -; GFX11-NEXT: v_dual_mov_b32 v28, s22 :: v_dual_mov_b32 v29, s23 -; GFX11-NEXT: v_dual_mov_b32 v23, s24 :: v_dual_mov_b32 v24, s25 +; GFX11-NEXT: v_dual_mov_b32 v30, s22 :: v_dual_mov_b32 v31, s23 +; GFX11-NEXT: v_dual_mov_b32 v28, s20 :: v_dual_mov_b32 v29, s21 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mov_b32_e32 v145, s0 ; GFX11-NEXT: v_readlane_b32 s0, v77, 3 -; GFX11-NEXT: v_dual_mov_b32 v21, s26 :: v_dual_mov_b32 v22, s27 -; GFX11-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v18, s29 +; GFX11-NEXT: v_dual_mov_b32 v23, s18 :: v_dual_mov_b32 v24, s19 +; GFX11-NEXT: v_dual_mov_b32 v21, s16 :: v_dual_mov_b32 v22, s17 ; GFX11-NEXT: v_mov_b32_e32 v144, s0 ; GFX11-NEXT: v_readlane_b32 s0, v77, 4 -; GFX11-NEXT: v_dual_mov_b32 v13, s40 :: v_dual_mov_b32 v14, s41 -; GFX11-NEXT: v_dual_mov_b32 v11, s14 :: v_dual_mov_b32 v12, s15 +; GFX11-NEXT: v_dual_mov_b32 v17, s14 :: v_dual_mov_b32 v18, s15 +; GFX11-NEXT: v_dual_mov_b32 v13, s12 :: v_dual_mov_b32 v14, s13 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mov_b32_e32 v134, s0 ; GFX11-NEXT: v_readlane_b32 s0, v77, 5 -; GFX11-NEXT: v_dual_mov_b32 v9, s4 :: v_dual_mov_b32 v10, s5 -; GFX11-NEXT: v_dual_mov_b32 v7, s6 :: v_dual_mov_b32 v8, s7 +; GFX11-NEXT: v_dual_mov_b32 v11, s10 :: v_dual_mov_b32 v12, s11 +; GFX11-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 ; GFX11-NEXT: v_mov_b32_e32 v135, s0 ; GFX11-NEXT: v_readlane_b32 s0, v77, 6 -; GFX11-NEXT: v_dual_mov_b32 v5, s8 :: v_dual_mov_b32 v6, s9 -; GFX11-NEXT: v_dual_mov_b32 v3, s10 :: v_dual_mov_b32 v4, s11 +; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v6, s5 +; GFX11-NEXT: v_dual_mov_b32 v3, s6 :: v_dual_mov_b32 v4, s7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mov_b32_e32 v133, s0 ; GFX11-NEXT: v_readlane_b32 s0, v77, 7 -; GFX11-NEXT: v_dual_mov_b32 v1, s12 :: v_dual_mov_b32 v2, s13 +; GFX11-NEXT: v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s9 ; GFX11-NEXT: v_dual_mov_b32 v74, s35 :: v_dual_mov_b32 v73, s104 ; GFX11-NEXT: v_mov_b32_e32 v132, s0 ; GFX11-NEXT: v_readlane_b32 s0, v77, 8 @@ -61719,31 +61646,59 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg ; SI-NEXT: v_writelane_b32 v63, s67, 19 ; SI-NEXT: v_writelane_b32 v63, s68, 20 ; SI-NEXT: v_writelane_b32 v63, s69, 21 +; SI-NEXT: v_mov_b32_e32 v20, s16 ; SI-NEXT: v_writelane_b32 v63, s70, 22 +; SI-NEXT: v_readfirstlane_b32 s6, v20 +; SI-NEXT: v_mov_b32_e32 v20, s17 ; SI-NEXT: v_writelane_b32 v63, s71, 23 +; SI-NEXT: v_readfirstlane_b32 s7, v20 +; SI-NEXT: v_mov_b32_e32 v20, s18 ; SI-NEXT: v_writelane_b32 v63, s80, 24 +; SI-NEXT: v_readfirstlane_b32 s10, v20 +; SI-NEXT: v_mov_b32_e32 v20, s19 ; SI-NEXT: v_writelane_b32 v63, s81, 25 +; SI-NEXT: v_readfirstlane_b32 s12, v20 +; SI-NEXT: v_mov_b32_e32 v20, s20 ; SI-NEXT: v_writelane_b32 v63, s82, 26 +; SI-NEXT: v_readfirstlane_b32 s14, v20 +; SI-NEXT: v_mov_b32_e32 v20, s21 ; SI-NEXT: v_writelane_b32 v63, s83, 27 +; SI-NEXT: v_readfirstlane_b32 s8, v20 +; SI-NEXT: v_mov_b32_e32 v20, s22 ; SI-NEXT: v_writelane_b32 v63, s84, 28 +; SI-NEXT: v_readfirstlane_b32 s9, v20 +; SI-NEXT: v_mov_b32_e32 v20, s23 ; SI-NEXT: v_writelane_b32 v63, s85, 29 +; SI-NEXT: v_readfirstlane_b32 s11, v20 +; SI-NEXT: v_mov_b32_e32 v20, s24 ; SI-NEXT: v_writelane_b32 v63, s86, 30 +; SI-NEXT: v_readfirstlane_b32 s13, v20 +; SI-NEXT: v_mov_b32_e32 v20, s25 ; SI-NEXT: v_writelane_b32 v63, s87, 31 +; SI-NEXT: v_readfirstlane_b32 s15, v20 +; SI-NEXT: v_mov_b32_e32 v20, s26 ; SI-NEXT: v_writelane_b32 v63, s96, 32 +; SI-NEXT: v_readfirstlane_b32 s16, v20 +; SI-NEXT: v_mov_b32_e32 v20, s27 ; SI-NEXT: v_writelane_b32 v63, s97, 33 +; SI-NEXT: v_readfirstlane_b32 s17, v20 +; SI-NEXT: v_mov_b32_e32 v20, s28 ; SI-NEXT: v_writelane_b32 v63, s98, 34 +; SI-NEXT: v_readfirstlane_b32 s18, v20 +; SI-NEXT: v_mov_b32_e32 v20, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 ; SI-NEXT: v_writelane_b32 v63, s99, 35 -; SI-NEXT: v_readfirstlane_b32 s6, v1 -; SI-NEXT: v_readfirstlane_b32 s7, v2 -; SI-NEXT: v_readfirstlane_b32 s8, v3 -; SI-NEXT: v_readfirstlane_b32 s9, v4 -; SI-NEXT: v_readfirstlane_b32 s10, v5 -; SI-NEXT: v_readfirstlane_b32 s11, v6 -; SI-NEXT: v_readfirstlane_b32 s12, v7 -; SI-NEXT: v_readfirstlane_b32 s13, v8 -; SI-NEXT: v_readfirstlane_b32 s14, v9 -; SI-NEXT: v_readfirstlane_b32 s15, v10 +; SI-NEXT: v_readfirstlane_b32 s19, v20 +; SI-NEXT: v_readfirstlane_b32 s20, v1 +; SI-NEXT: v_readfirstlane_b32 s21, v2 +; SI-NEXT: v_readfirstlane_b32 s22, v3 +; SI-NEXT: v_readfirstlane_b32 s23, v4 +; SI-NEXT: v_readfirstlane_b32 s24, v5 +; SI-NEXT: v_readfirstlane_b32 s25, v6 +; SI-NEXT: v_readfirstlane_b32 s26, v7 +; SI-NEXT: v_readfirstlane_b32 s27, v8 +; SI-NEXT: v_readfirstlane_b32 s28, v9 +; SI-NEXT: v_readfirstlane_b32 s29, v10 ; SI-NEXT: v_readfirstlane_b32 s40, v11 ; SI-NEXT: v_readfirstlane_b32 s41, v12 ; SI-NEXT: v_readfirstlane_b32 s42, v13 @@ -61790,86 +61745,86 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg ; SI-NEXT: s_lshl_b32 s77, s41, 16 ; SI-NEXT: s_and_b32 s78, s40, 0xffff0000 ; SI-NEXT: s_lshl_b32 s79, s40, 16 -; SI-NEXT: s_and_b32 s88, s15, 0xffff0000 -; SI-NEXT: s_lshl_b32 s89, s15, 16 -; SI-NEXT: s_and_b32 s90, s14, 0xffff0000 -; SI-NEXT: s_lshl_b32 s91, s14, 16 -; SI-NEXT: s_and_b32 s92, s13, 0xffff0000 -; SI-NEXT: s_lshl_b32 s93, s13, 16 -; SI-NEXT: s_and_b32 s94, s12, 0xffff0000 -; SI-NEXT: s_lshl_b32 s95, s12, 16 -; SI-NEXT: s_and_b32 s30, s11, 0xffff0000 -; SI-NEXT: s_lshl_b32 s31, s11, 16 -; SI-NEXT: s_and_b32 s34, s10, 0xffff0000 -; SI-NEXT: s_lshl_b32 s35, s10, 16 -; SI-NEXT: s_and_b32 s36, s9, 0xffff0000 -; SI-NEXT: s_lshl_b32 s37, s9, 16 -; SI-NEXT: s_and_b32 s38, s8, 0xffff0000 -; SI-NEXT: s_lshl_b32 s39, s8, 16 -; SI-NEXT: s_and_b32 s48, s7, 0xffff0000 -; SI-NEXT: s_lshl_b32 s49, s7, 16 -; SI-NEXT: s_and_b32 s50, s6, 0xffff0000 -; SI-NEXT: s_lshl_b32 s51, s6, 16 -; SI-NEXT: s_and_b32 s52, s29, 0xffff0000 -; SI-NEXT: s_lshl_b32 s53, s29, 16 -; SI-NEXT: s_and_b32 s54, s28, 0xffff0000 -; SI-NEXT: s_lshl_b32 s55, s28, 16 -; SI-NEXT: s_and_b32 s64, s27, 0xffff0000 -; SI-NEXT: s_lshl_b32 s65, s27, 16 -; SI-NEXT: s_and_b32 s66, s26, 0xffff0000 -; SI-NEXT: s_lshl_b32 s67, s26, 16 -; SI-NEXT: s_and_b32 s68, s25, 0xffff0000 -; SI-NEXT: s_lshl_b32 s69, s25, 16 -; SI-NEXT: s_and_b32 s70, s24, 0xffff0000 -; SI-NEXT: s_lshl_b32 s71, s24, 16 -; SI-NEXT: s_and_b32 s80, s23, 0xffff0000 -; SI-NEXT: s_lshl_b32 s81, s23, 16 -; SI-NEXT: s_and_b32 s82, s22, 0xffff0000 -; SI-NEXT: s_lshl_b32 s83, s22, 16 -; SI-NEXT: s_and_b32 s84, s21, 0xffff0000 -; SI-NEXT: s_lshl_b32 s85, s21, 16 -; SI-NEXT: s_and_b32 s86, s20, 0xffff0000 -; SI-NEXT: s_lshl_b32 s87, s20, 16 -; SI-NEXT: s_and_b32 s96, s19, 0xffff0000 -; SI-NEXT: s_lshl_b32 s97, s19, 16 -; SI-NEXT: s_and_b32 s98, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s99, s18, 16 -; SI-NEXT: s_and_b32 s56, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s57, s17, 16 -; SI-NEXT: s_and_b32 s58, s16, 0xffff0000 -; SI-NEXT: s_lshl_b32 s59, s16, 16 +; SI-NEXT: s_and_b32 s88, s29, 0xffff0000 +; SI-NEXT: s_lshl_b32 s89, s29, 16 +; SI-NEXT: s_and_b32 s90, s28, 0xffff0000 +; SI-NEXT: s_lshl_b32 s91, s28, 16 +; SI-NEXT: s_and_b32 s92, s27, 0xffff0000 +; SI-NEXT: s_lshl_b32 s93, s27, 16 +; SI-NEXT: s_and_b32 s94, s26, 0xffff0000 +; SI-NEXT: s_lshl_b32 s95, s26, 16 +; SI-NEXT: s_and_b32 s30, s25, 0xffff0000 +; SI-NEXT: s_lshl_b32 s31, s25, 16 +; SI-NEXT: s_and_b32 s34, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s35, s24, 16 +; SI-NEXT: s_and_b32 s36, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s37, s23, 16 +; SI-NEXT: s_and_b32 s38, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s39, s22, 16 +; SI-NEXT: s_and_b32 s48, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s49, s21, 16 +; SI-NEXT: s_and_b32 s50, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s51, s20, 16 +; SI-NEXT: s_and_b32 s52, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s53, s19, 16 +; SI-NEXT: s_and_b32 s54, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s55, s18, 16 +; SI-NEXT: s_and_b32 s64, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s65, s17, 16 +; SI-NEXT: s_and_b32 s66, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s67, s16, 16 +; SI-NEXT: s_and_b32 s68, s15, 0xffff0000 +; SI-NEXT: s_lshl_b32 s69, s15, 16 +; SI-NEXT: s_and_b32 s70, s13, 0xffff0000 +; SI-NEXT: s_lshl_b32 s71, s13, 16 +; SI-NEXT: s_and_b32 s80, s11, 0xffff0000 +; SI-NEXT: s_lshl_b32 s81, s11, 16 +; SI-NEXT: s_and_b32 s82, s9, 0xffff0000 +; SI-NEXT: s_lshl_b32 s83, s9, 16 +; SI-NEXT: s_and_b32 s84, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s85, s8, 16 +; SI-NEXT: s_and_b32 s86, s14, 0xffff0000 +; SI-NEXT: s_lshl_b32 s87, s14, 16 +; SI-NEXT: s_and_b32 s96, s12, 0xffff0000 +; SI-NEXT: s_lshl_b32 s97, s12, 16 +; SI-NEXT: s_and_b32 s98, s10, 0xffff0000 +; SI-NEXT: s_lshl_b32 s99, s10, 16 +; SI-NEXT: s_and_b32 s56, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s57, s7, 16 +; SI-NEXT: s_and_b32 s58, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s59, s6, 16 ; SI-NEXT: s_cbranch_execnz .LBB41_4 ; SI-NEXT: .LBB41_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v2, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s12, 1.0 ; SI-NEXT: v_add_f32_e64 v4, s47, 1.0 -; SI-NEXT: v_add_f32_e64 v1, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s10, 1.0 ; SI-NEXT: v_add_f32_e64 v6, s46, 1.0 ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_and_b32_e32 v58, 0xffff0000, v2 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v2 -; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v3, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v45, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v43, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v41, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v55, s24, 1.0 -; SI-NEXT: v_add_f32_e64 v53, s25, 1.0 -; SI-NEXT: v_add_f32_e64 v51, s26, 1.0 -; SI-NEXT: v_add_f32_e64 v49, s27, 1.0 -; SI-NEXT: v_add_f32_e64 v39, s28, 1.0 -; SI-NEXT: v_add_f32_e64 v37, s29, 1.0 -; SI-NEXT: v_add_f32_e64 v35, s6, 1.0 -; SI-NEXT: v_add_f32_e64 v33, s7, 1.0 -; SI-NEXT: v_add_f32_e64 v31, s8, 1.0 -; SI-NEXT: v_add_f32_e64 v29, s9, 1.0 -; SI-NEXT: v_add_f32_e64 v27, s10, 1.0 -; SI-NEXT: v_add_f32_e64 v25, s11, 1.0 -; SI-NEXT: v_add_f32_e64 v23, s12, 1.0 -; SI-NEXT: v_add_f32_e64 v21, s13, 1.0 -; SI-NEXT: v_add_f32_e64 v19, s14, 1.0 -; SI-NEXT: v_add_f32_e64 v17, s15, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s7, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s14, 1.0 +; SI-NEXT: v_add_f32_e64 v45, s8, 1.0 +; SI-NEXT: v_add_f32_e64 v43, s9, 1.0 +; SI-NEXT: v_add_f32_e64 v41, s11, 1.0 +; SI-NEXT: v_add_f32_e64 v55, s13, 1.0 +; SI-NEXT: v_add_f32_e64 v53, s15, 1.0 +; SI-NEXT: v_add_f32_e64 v51, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v49, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v39, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v37, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v35, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v33, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v31, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v29, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v27, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v25, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v23, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v21, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v19, s28, 1.0 +; SI-NEXT: v_add_f32_e64 v17, s29, 1.0 ; SI-NEXT: v_add_f32_e64 v15, s40, 1.0 ; SI-NEXT: v_add_f32_e64 v13, s41, 1.0 ; SI-NEXT: v_add_f32_e64 v11, s42, 1.0 @@ -61885,7 +61840,7 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 -; SI-NEXT: v_add_f32_e64 v2, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s6, 1.0 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -70452,25 +70407,6 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: v_readfirstlane_b32 s47, v1 -; SI-NEXT: v_readfirstlane_b32 s46, v2 -; SI-NEXT: v_readfirstlane_b32 s45, v3 -; SI-NEXT: v_readfirstlane_b32 s44, v4 -; SI-NEXT: v_readfirstlane_b32 s43, v5 -; SI-NEXT: v_readfirstlane_b32 s42, v6 -; SI-NEXT: v_readfirstlane_b32 s41, v7 -; SI-NEXT: v_readfirstlane_b32 s40, v8 -; SI-NEXT: v_readfirstlane_b32 s15, v9 -; SI-NEXT: v_readfirstlane_b32 s14, v10 -; SI-NEXT: v_readfirstlane_b32 s13, v11 -; SI-NEXT: v_readfirstlane_b32 s12, v12 -; SI-NEXT: v_readfirstlane_b32 s11, v13 -; SI-NEXT: v_readfirstlane_b32 s10, v14 -; SI-NEXT: v_readfirstlane_b32 s8, v15 -; SI-NEXT: v_readfirstlane_b32 s7, v16 -; SI-NEXT: v_readfirstlane_b32 s6, v17 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v18 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -70487,484 +70423,653 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v42, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v43, s17 +; SI-NEXT: v_mov_b32_e32 v41, s18 +; SI-NEXT: v_mov_b32_e32 v40, s19 +; SI-NEXT: v_mov_b32_e32 v55, s20 +; SI-NEXT: v_mov_b32_e32 v54, s21 +; SI-NEXT: v_mov_b32_e32 v53, s22 +; SI-NEXT: v_mov_b32_e32 v52, s23 +; SI-NEXT: v_mov_b32_e32 v51, s24 +; SI-NEXT: v_mov_b32_e32 v50, s25 +; SI-NEXT: v_mov_b32_e32 v49, s26 +; SI-NEXT: v_mov_b32_e32 v39, s27 +; SI-NEXT: v_mov_b32_e32 v38, s28 +; SI-NEXT: v_mov_b32_e32 v37, s29 ; SI-NEXT: s_cbranch_scc0 .LBB45_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 -; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 -; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 -; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s40, 16 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v18 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v61, s4 -; SI-NEXT: s_lshr_b32 s4, s41, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_lshr_b32 s4, s42, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_lshr_b32 s4, s43, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: s_lshr_b32 s4, s44, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 -; SI-NEXT: s_lshr_b32 s4, s45, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: s_lshr_b32 s4, s46, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 -; SI-NEXT: s_lshr_b32 s4, s47, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v61, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v62, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v63, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v22, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s17 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v15 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v39 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v50 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v52 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v53 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v54 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v55 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v41 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v6 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v43 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v42 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v3 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v1 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v37 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v49 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v1, s18, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v41 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e64 v2, s19, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v10 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_add_f32_e64 v24, s41, 1.0 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e64 v23, s29, 1.0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v24 -; SI-NEXT: v_add_f32_e64 v37, s10, 1.0 -; SI-NEXT: v_add_f32_e64 v45, s9, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e64 v41, s6, 1.0 -; SI-NEXT: v_add_f32_e64 v19, s14, 1.0 -; SI-NEXT: v_add_f32_e64 v15, s12, 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v41 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v19 -; SI-NEXT: v_add_f32_e64 v18, s27, 1.0 -; SI-NEXT: v_add_f32_e64 v49, s8, 1.0 -; SI-NEXT: v_add_f32_e64 v10, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v14, s25, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 -; SI-NEXT: v_add_f32_e64 v21, s28, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v49 -; SI-NEXT: v_add_f32_e64 v53, s7, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v1 -; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v6, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v8, s22, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v10 -; SI-NEXT: v_add_f32_e64 v12, s24, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v14 -; SI-NEXT: v_add_f32_e64 v16, s26, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 -; SI-NEXT: v_add_f32_e64 v25, s47, 1.0 -; SI-NEXT: v_add_f32_e64 v27, s46, 1.0 -; SI-NEXT: v_add_f32_e64 v29, s45, 1.0 -; SI-NEXT: v_add_f32_e64 v28, s43, 1.0 -; SI-NEXT: v_add_f32_e64 v26, s42, 1.0 -; SI-NEXT: v_add_f32_e64 v20, s15, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v53 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v1 -; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v60 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v58 ; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v20 -; SI-NEXT: v_add_f32_e64 v17, s13, 1.0 -; SI-NEXT: v_add_f32_e64 v34, s11, 1.0 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v43 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v56 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v56, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v62, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e64 v30, s44, 1.0 -; SI-NEXT: v_add_f32_e64 v22, s40, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v22 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v56 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v42 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v46 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v40 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v43 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v4 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v42 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v55 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v53 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_add_f32_e32 v35, 1.0, v37 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v40 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v29, 1.0, v54 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: v_add_f32_e32 v34, 1.0, v38 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v53 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v52 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v51 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v50 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v49 +; SI-NEXT: v_add_f32_e32 v32, 1.0, v39 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v18 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v2 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: .LBB45_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v10 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v22 ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v20 ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v17 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v30 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v28 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v26 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v24 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v22 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload @@ -70986,72 +71091,103 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; kill: killed $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: s_branch .LBB45_2 ; ; VI-LABEL: bitcast_v32f32_to_v64f16_scalar: @@ -75868,23 +76004,51 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; VI-LABEL: bitcast_v64i16_to_v32f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v19, s16 ; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: v_mov_b32_e32 v2, s17 ; VI-NEXT: v_readfirstlane_b32 s7, v3 +; VI-NEXT: v_mov_b32_e32 v3, s18 ; VI-NEXT: v_readfirstlane_b32 s8, v4 +; VI-NEXT: v_mov_b32_e32 v4, s19 ; VI-NEXT: v_readfirstlane_b32 s9, v5 +; VI-NEXT: v_mov_b32_e32 v5, s20 ; VI-NEXT: v_readfirstlane_b32 s10, v6 +; VI-NEXT: v_mov_b32_e32 v6, s21 ; VI-NEXT: v_readfirstlane_b32 s11, v7 +; VI-NEXT: v_mov_b32_e32 v7, s22 ; VI-NEXT: v_readfirstlane_b32 s12, v8 +; VI-NEXT: v_mov_b32_e32 v8, s23 ; VI-NEXT: v_readfirstlane_b32 s13, v9 +; VI-NEXT: v_mov_b32_e32 v9, s24 ; VI-NEXT: v_readfirstlane_b32 s14, v10 +; VI-NEXT: v_mov_b32_e32 v10, s25 ; VI-NEXT: v_readfirstlane_b32 s15, v11 -; VI-NEXT: v_readfirstlane_b32 s40, v12 -; VI-NEXT: v_readfirstlane_b32 s41, v13 -; VI-NEXT: v_readfirstlane_b32 s42, v14 -; VI-NEXT: v_readfirstlane_b32 s43, v15 -; VI-NEXT: v_readfirstlane_b32 s44, v16 -; VI-NEXT: v_readfirstlane_b32 s45, v17 +; VI-NEXT: v_mov_b32_e32 v11, s26 +; VI-NEXT: v_readfirstlane_b32 s16, v12 +; VI-NEXT: v_mov_b32_e32 v12, s27 +; VI-NEXT: v_readfirstlane_b32 s17, v13 +; VI-NEXT: v_mov_b32_e32 v13, s28 +; VI-NEXT: v_readfirstlane_b32 s18, v14 +; VI-NEXT: v_mov_b32_e32 v14, s29 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_readfirstlane_b32 s19, v15 +; VI-NEXT: v_readfirstlane_b32 s20, v16 +; VI-NEXT: v_readfirstlane_b32 s21, v17 +; VI-NEXT: v_readfirstlane_b32 s22, v19 +; VI-NEXT: v_readfirstlane_b32 s23, v2 +; VI-NEXT: v_readfirstlane_b32 s24, v3 +; VI-NEXT: v_readfirstlane_b32 s25, v4 +; VI-NEXT: v_readfirstlane_b32 s26, v5 +; VI-NEXT: v_readfirstlane_b32 s27, v6 +; VI-NEXT: v_readfirstlane_b32 s28, v7 +; VI-NEXT: v_readfirstlane_b32 s29, v8 +; VI-NEXT: v_readfirstlane_b32 s40, v9 +; VI-NEXT: v_readfirstlane_b32 s41, v10 +; VI-NEXT: v_readfirstlane_b32 s42, v11 +; VI-NEXT: v_readfirstlane_b32 s43, v12 +; VI-NEXT: v_readfirstlane_b32 s44, v13 +; VI-NEXT: v_readfirstlane_b32 s45, v14 ; VI-NEXT: v_readfirstlane_b32 s46, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s47, v1 @@ -75901,8 +76065,38 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; VI-NEXT: s_and_b32 s4, s46, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s29, 3 +; VI-NEXT: s_add_i32 s5, s45, 3 ; VI-NEXT: s_add_i32 s46, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s45, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s44, 3 +; VI-NEXT: s_add_i32 s45, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s44, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s43, 3 +; VI-NEXT: s_add_i32 s44, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s43, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s42, 3 +; VI-NEXT: s_add_i32 s43, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s42, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s41, 3 +; VI-NEXT: s_add_i32 s42, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s41, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s40, 3 +; VI-NEXT: s_add_i32 s41, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s40, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s29, 3 +; VI-NEXT: s_add_i32 s40, s4, 0x30000 ; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -75971,38 +76165,8 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s45, 3 -; VI-NEXT: s_add_i32 s16, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s45, 0xffff0000 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s44, 3 -; VI-NEXT: s_add_i32 s45, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s44, 0xffff0000 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s43, 3 -; VI-NEXT: s_add_i32 s44, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s43, 0xffff0000 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s42, 3 -; VI-NEXT: s_add_i32 s43, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s42, 0xffff0000 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s41, 3 -; VI-NEXT: s_add_i32 s42, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s41, 0xffff0000 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s40, 3 -; VI-NEXT: s_add_i32 s41, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s40, 0xffff0000 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s15, 3 -; VI-NEXT: s_add_i32 s40, s4, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: s_and_b32 s4, s15, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -76053,20 +76217,20 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s6, s4, 0x30000 ; VI-NEXT: .LBB51_3: ; %end -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 -; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 -; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v0, s22 +; VI-NEXT: v_mov_b32_e32 v1, s23 +; VI-NEXT: v_mov_b32_e32 v2, s24 +; VI-NEXT: v_mov_b32_e32 v3, s25 +; VI-NEXT: v_mov_b32_e32 v4, s26 +; VI-NEXT: v_mov_b32_e32 v5, s27 +; VI-NEXT: v_mov_b32_e32 v6, s28 +; VI-NEXT: v_mov_b32_e32 v7, s29 +; VI-NEXT: v_mov_b32_e32 v8, s40 +; VI-NEXT: v_mov_b32_e32 v9, s41 +; VI-NEXT: v_mov_b32_e32 v10, s42 +; VI-NEXT: v_mov_b32_e32 v11, s43 +; VI-NEXT: v_mov_b32_e32 v12, s44 +; VI-NEXT: v_mov_b32_e32 v13, s45 ; VI-NEXT: v_mov_b32_e32 v14, s46 ; VI-NEXT: v_mov_b32_e32 v15, s47 ; VI-NEXT: v_mov_b32_e32 v16, s6 @@ -76079,12 +76243,12 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; VI-NEXT: v_mov_b32_e32 v23, s13 ; VI-NEXT: v_mov_b32_e32 v24, s14 ; VI-NEXT: v_mov_b32_e32 v25, s15 -; VI-NEXT: v_mov_b32_e32 v26, s40 -; VI-NEXT: v_mov_b32_e32 v27, s41 -; VI-NEXT: v_mov_b32_e32 v28, s42 -; VI-NEXT: v_mov_b32_e32 v29, s43 -; VI-NEXT: v_mov_b32_e32 v30, s44 -; VI-NEXT: v_mov_b32_e32 v31, s45 +; VI-NEXT: v_mov_b32_e32 v26, s16 +; VI-NEXT: v_mov_b32_e32 v27, s17 +; VI-NEXT: v_mov_b32_e32 v28, s18 +; VI-NEXT: v_mov_b32_e32 v29, s19 +; VI-NEXT: v_mov_b32_e32 v30, s20 +; VI-NEXT: v_mov_b32_e32 v31, s21 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB51_4: ; VI-NEXT: s_branch .LBB51_2 @@ -81779,53 +81943,81 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_writelane_b32 v20, s30, 0 -; SI-NEXT: v_writelane_b32 v20, s31, 1 -; SI-NEXT: v_writelane_b32 v20, s34, 2 -; SI-NEXT: v_writelane_b32 v20, s35, 3 -; SI-NEXT: v_writelane_b32 v20, s36, 4 -; SI-NEXT: v_writelane_b32 v20, s37, 5 -; SI-NEXT: v_writelane_b32 v20, s38, 6 -; SI-NEXT: v_writelane_b32 v20, s39, 7 -; SI-NEXT: v_writelane_b32 v20, s48, 8 -; SI-NEXT: v_writelane_b32 v20, s49, 9 -; SI-NEXT: v_writelane_b32 v20, s50, 10 -; SI-NEXT: v_writelane_b32 v20, s51, 11 -; SI-NEXT: v_writelane_b32 v20, s52, 12 -; SI-NEXT: v_writelane_b32 v20, s53, 13 -; SI-NEXT: v_writelane_b32 v20, s54, 14 -; SI-NEXT: v_writelane_b32 v20, s55, 15 -; SI-NEXT: v_writelane_b32 v20, s64, 16 -; SI-NEXT: v_writelane_b32 v20, s65, 17 -; SI-NEXT: v_writelane_b32 v20, s66, 18 -; SI-NEXT: v_writelane_b32 v20, s67, 19 -; SI-NEXT: v_writelane_b32 v20, s68, 20 -; SI-NEXT: v_writelane_b32 v20, s69, 21 -; SI-NEXT: v_writelane_b32 v20, s70, 22 -; SI-NEXT: v_writelane_b32 v20, s71, 23 -; SI-NEXT: v_writelane_b32 v20, s80, 24 -; SI-NEXT: v_writelane_b32 v20, s81, 25 -; SI-NEXT: v_writelane_b32 v20, s82, 26 -; SI-NEXT: v_writelane_b32 v20, s83, 27 -; SI-NEXT: v_writelane_b32 v20, s84, 28 -; SI-NEXT: v_writelane_b32 v20, s85, 29 -; SI-NEXT: v_writelane_b32 v20, s86, 30 -; SI-NEXT: v_writelane_b32 v20, s87, 31 -; SI-NEXT: v_writelane_b32 v20, s96, 32 -; SI-NEXT: v_writelane_b32 v20, s97, 33 +; SI-NEXT: v_writelane_b32 v21, s30, 0 +; SI-NEXT: v_writelane_b32 v21, s31, 1 +; SI-NEXT: v_writelane_b32 v21, s34, 2 +; SI-NEXT: v_writelane_b32 v21, s35, 3 +; SI-NEXT: v_writelane_b32 v21, s36, 4 +; SI-NEXT: v_writelane_b32 v21, s37, 5 +; SI-NEXT: v_writelane_b32 v21, s38, 6 +; SI-NEXT: v_writelane_b32 v21, s39, 7 +; SI-NEXT: v_writelane_b32 v21, s48, 8 +; SI-NEXT: v_writelane_b32 v21, s49, 9 +; SI-NEXT: v_writelane_b32 v21, s50, 10 +; SI-NEXT: v_writelane_b32 v21, s51, 11 +; SI-NEXT: v_writelane_b32 v21, s52, 12 +; SI-NEXT: v_writelane_b32 v21, s53, 13 +; SI-NEXT: v_writelane_b32 v21, s54, 14 +; SI-NEXT: v_writelane_b32 v21, s55, 15 +; SI-NEXT: v_writelane_b32 v21, s64, 16 +; SI-NEXT: v_writelane_b32 v21, s65, 17 +; SI-NEXT: v_writelane_b32 v21, s66, 18 +; SI-NEXT: v_writelane_b32 v21, s67, 19 +; SI-NEXT: v_writelane_b32 v21, s68, 20 +; SI-NEXT: v_mov_b32_e32 v20, s16 +; SI-NEXT: v_writelane_b32 v21, s69, 21 +; SI-NEXT: v_readfirstlane_b32 s56, v20 +; SI-NEXT: v_mov_b32_e32 v20, s17 +; SI-NEXT: v_writelane_b32 v21, s70, 22 +; SI-NEXT: v_readfirstlane_b32 s57, v20 +; SI-NEXT: v_mov_b32_e32 v20, s18 +; SI-NEXT: v_writelane_b32 v21, s71, 23 +; SI-NEXT: v_readfirstlane_b32 s46, v20 +; SI-NEXT: v_mov_b32_e32 v20, s19 +; SI-NEXT: v_writelane_b32 v21, s80, 24 +; SI-NEXT: v_readfirstlane_b32 s47, v20 +; SI-NEXT: v_mov_b32_e32 v20, s20 +; SI-NEXT: v_writelane_b32 v21, s81, 25 +; SI-NEXT: v_readfirstlane_b32 s44, v20 +; SI-NEXT: v_mov_b32_e32 v20, s21 +; SI-NEXT: v_writelane_b32 v21, s82, 26 +; SI-NEXT: v_readfirstlane_b32 s45, v20 +; SI-NEXT: v_mov_b32_e32 v20, s22 +; SI-NEXT: v_writelane_b32 v21, s83, 27 +; SI-NEXT: v_readfirstlane_b32 s42, v20 +; SI-NEXT: v_mov_b32_e32 v20, s23 +; SI-NEXT: v_writelane_b32 v21, s84, 28 +; SI-NEXT: v_readfirstlane_b32 s43, v20 +; SI-NEXT: v_mov_b32_e32 v20, s24 +; SI-NEXT: v_writelane_b32 v21, s85, 29 +; SI-NEXT: v_readfirstlane_b32 s40, v20 +; SI-NEXT: v_mov_b32_e32 v20, s25 +; SI-NEXT: v_writelane_b32 v21, s86, 30 +; SI-NEXT: v_readfirstlane_b32 s41, v20 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_writelane_b32 v21, s87, 31 +; SI-NEXT: v_readfirstlane_b32 s24, v20 +; SI-NEXT: v_mov_b32_e32 v20, s27 +; SI-NEXT: v_writelane_b32 v21, s96, 32 +; SI-NEXT: v_readfirstlane_b32 s25, v20 +; SI-NEXT: v_mov_b32_e32 v20, s28 +; SI-NEXT: v_writelane_b32 v21, s97, 33 +; SI-NEXT: v_readfirstlane_b32 s22, v20 +; SI-NEXT: v_mov_b32_e32 v20, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: v_writelane_b32 v20, s98, 34 -; SI-NEXT: v_readfirstlane_b32 s44, v1 -; SI-NEXT: v_readfirstlane_b32 s45, v2 -; SI-NEXT: v_readfirstlane_b32 s42, v3 -; SI-NEXT: v_readfirstlane_b32 s43, v4 -; SI-NEXT: v_readfirstlane_b32 s40, v5 -; SI-NEXT: v_readfirstlane_b32 s41, v6 +; SI-NEXT: v_writelane_b32 v21, s98, 34 +; SI-NEXT: v_readfirstlane_b32 s23, v20 +; SI-NEXT: v_readfirstlane_b32 s20, v1 +; SI-NEXT: v_readfirstlane_b32 s21, v2 +; SI-NEXT: v_readfirstlane_b32 s18, v3 +; SI-NEXT: v_readfirstlane_b32 s19, v4 +; SI-NEXT: v_readfirstlane_b32 s16, v5 +; SI-NEXT: v_readfirstlane_b32 s17, v6 ; SI-NEXT: v_readfirstlane_b32 s14, v7 ; SI-NEXT: v_readfirstlane_b32 s15, v8 ; SI-NEXT: v_readfirstlane_b32 s12, v9 @@ -81837,192 +82029,192 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: v_readfirstlane_b32 s6, v15 ; SI-NEXT: v_readfirstlane_b32 s7, v16 ; SI-NEXT: v_readfirstlane_b32 s4, v17 -; SI-NEXT: s_and_b64 s[46:47], vcc, exec +; SI-NEXT: s_and_b64 s[26:27], vcc, exec ; SI-NEXT: v_readfirstlane_b32 s5, v18 -; SI-NEXT: v_writelane_b32 v20, s99, 35 +; SI-NEXT: v_writelane_b32 v21, s99, 35 +; SI-NEXT: ; implicit-def: $vgpr23 : SGPR spill to VGPR lane ; SI-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane -; SI-NEXT: ; implicit-def: $vgpr21 : SGPR spill to VGPR lane ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s46, s5, 24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v22, s46, 40 -; SI-NEXT: s_lshr_b32 s46, s5, 16 -; SI-NEXT: v_writelane_b32 v22, s46, 41 -; SI-NEXT: s_lshr_b32 s46, s5, 8 -; SI-NEXT: v_writelane_b32 v22, s46, 42 -; SI-NEXT: s_lshr_b32 s46, s7, 24 -; SI-NEXT: v_writelane_b32 v22, s46, 43 -; SI-NEXT: s_lshr_b32 s46, s7, 16 -; SI-NEXT: v_writelane_b32 v22, s46, 44 -; SI-NEXT: s_lshr_b32 s46, s7, 8 -; SI-NEXT: v_writelane_b32 v22, s46, 45 -; SI-NEXT: s_lshr_b32 s46, s9, 24 -; SI-NEXT: v_writelane_b32 v22, s46, 46 -; SI-NEXT: s_lshr_b32 s46, s9, 16 -; SI-NEXT: v_writelane_b32 v22, s46, 47 -; SI-NEXT: s_lshr_b32 s46, s9, 8 -; SI-NEXT: v_writelane_b32 v22, s46, 48 -; SI-NEXT: s_lshr_b32 s46, s11, 24 -; SI-NEXT: v_writelane_b32 v22, s46, 49 -; SI-NEXT: s_lshr_b32 s46, s11, 16 -; SI-NEXT: v_writelane_b32 v22, s46, 50 -; SI-NEXT: s_lshr_b32 s46, s11, 8 -; SI-NEXT: v_writelane_b32 v22, s46, 51 -; SI-NEXT: s_lshr_b32 s46, s13, 24 -; SI-NEXT: v_writelane_b32 v22, s46, 52 -; SI-NEXT: s_lshr_b32 s46, s13, 16 -; SI-NEXT: v_writelane_b32 v22, s46, 53 -; SI-NEXT: s_lshr_b32 s46, s13, 8 -; SI-NEXT: v_writelane_b32 v22, s46, 54 -; SI-NEXT: s_lshr_b32 s46, s15, 24 -; SI-NEXT: v_writelane_b32 v22, s46, 55 -; SI-NEXT: s_lshr_b32 s46, s15, 16 -; SI-NEXT: v_writelane_b32 v22, s46, 56 -; SI-NEXT: s_lshr_b32 s46, s15, 8 -; SI-NEXT: v_writelane_b32 v22, s46, 57 -; SI-NEXT: s_lshr_b32 s46, s41, 24 -; SI-NEXT: v_writelane_b32 v22, s46, 58 -; SI-NEXT: s_lshr_b32 s46, s41, 16 -; SI-NEXT: v_writelane_b32 v22, s46, 59 -; SI-NEXT: s_lshr_b32 s46, s41, 8 -; SI-NEXT: v_writelane_b32 v22, s46, 60 -; SI-NEXT: s_lshr_b32 s46, s43, 24 -; SI-NEXT: v_writelane_b32 v22, s46, 61 -; SI-NEXT: s_lshr_b32 s46, s43, 16 -; SI-NEXT: v_writelane_b32 v22, s46, 62 -; SI-NEXT: s_lshr_b32 s46, s43, 8 -; SI-NEXT: v_writelane_b32 v22, s46, 63 -; SI-NEXT: s_lshr_b32 s46, s45, 24 -; SI-NEXT: v_writelane_b32 v21, s46, 0 -; SI-NEXT: s_lshr_b32 s46, s45, 16 -; SI-NEXT: v_writelane_b32 v21, s46, 1 -; SI-NEXT: s_lshr_b32 s46, s45, 8 -; SI-NEXT: v_writelane_b32 v21, s46, 2 -; SI-NEXT: s_lshr_b32 s46, s29, 24 -; SI-NEXT: v_writelane_b32 v21, s46, 3 -; SI-NEXT: s_lshr_b32 s46, s29, 16 -; SI-NEXT: v_writelane_b32 v21, s46, 4 -; SI-NEXT: s_lshr_b32 s46, s29, 8 -; SI-NEXT: v_writelane_b32 v21, s46, 5 -; SI-NEXT: s_lshr_b32 s46, s27, 24 -; SI-NEXT: v_writelane_b32 v21, s46, 6 -; SI-NEXT: s_lshr_b32 s46, s27, 16 -; SI-NEXT: v_writelane_b32 v21, s46, 7 -; SI-NEXT: s_lshr_b32 s46, s27, 8 -; SI-NEXT: v_writelane_b32 v21, s46, 8 -; SI-NEXT: s_lshr_b32 s46, s25, 24 -; SI-NEXT: v_writelane_b32 v21, s46, 9 -; SI-NEXT: s_lshr_b32 s46, s25, 16 -; SI-NEXT: v_writelane_b32 v21, s46, 10 -; SI-NEXT: s_lshr_b32 s46, s25, 8 -; SI-NEXT: v_writelane_b32 v21, s46, 11 -; SI-NEXT: s_lshr_b32 s46, s23, 24 -; SI-NEXT: v_writelane_b32 v21, s46, 12 -; SI-NEXT: s_lshr_b32 s46, s23, 16 -; SI-NEXT: v_writelane_b32 v21, s46, 13 -; SI-NEXT: s_lshr_b32 s46, s23, 8 -; SI-NEXT: v_writelane_b32 v21, s46, 14 -; SI-NEXT: s_lshr_b32 s46, s21, 24 -; SI-NEXT: v_writelane_b32 v21, s46, 15 -; SI-NEXT: s_lshr_b32 s46, s21, 16 -; SI-NEXT: v_writelane_b32 v21, s46, 16 -; SI-NEXT: s_lshr_b32 s46, s21, 8 -; SI-NEXT: v_writelane_b32 v21, s46, 17 -; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 -; SI-NEXT: v_writelane_b32 v22, s46, 38 -; SI-NEXT: v_writelane_b32 v22, s47, 39 -; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 8 -; SI-NEXT: v_writelane_b32 v22, s46, 36 -; SI-NEXT: v_writelane_b32 v22, s47, 37 -; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 24 -; SI-NEXT: v_writelane_b32 v22, s46, 34 -; SI-NEXT: v_writelane_b32 v22, s47, 35 -; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 16 -; SI-NEXT: v_writelane_b32 v22, s46, 32 -; SI-NEXT: v_writelane_b32 v22, s47, 33 -; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 8 -; SI-NEXT: v_writelane_b32 v22, s46, 30 -; SI-NEXT: v_writelane_b32 v22, s47, 31 -; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 24 -; SI-NEXT: v_writelane_b32 v22, s46, 28 -; SI-NEXT: v_writelane_b32 v22, s47, 29 -; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 16 -; SI-NEXT: v_writelane_b32 v22, s46, 26 -; SI-NEXT: v_writelane_b32 v22, s47, 27 -; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 8 -; SI-NEXT: v_writelane_b32 v22, s46, 24 -; SI-NEXT: v_writelane_b32 v22, s47, 25 -; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 24 -; SI-NEXT: v_writelane_b32 v22, s46, 22 -; SI-NEXT: v_writelane_b32 v22, s47, 23 -; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 16 -; SI-NEXT: v_writelane_b32 v22, s46, 20 -; SI-NEXT: v_writelane_b32 v22, s47, 21 -; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 8 -; SI-NEXT: v_writelane_b32 v22, s46, 18 -; SI-NEXT: v_writelane_b32 v22, s47, 19 -; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 24 -; SI-NEXT: v_writelane_b32 v22, s46, 16 -; SI-NEXT: v_writelane_b32 v22, s47, 17 -; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 16 -; SI-NEXT: v_writelane_b32 v22, s46, 14 -; SI-NEXT: v_writelane_b32 v22, s47, 15 -; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 8 -; SI-NEXT: v_writelane_b32 v22, s46, 12 -; SI-NEXT: v_writelane_b32 v22, s47, 13 -; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 24 -; SI-NEXT: v_writelane_b32 v22, s46, 10 -; SI-NEXT: v_writelane_b32 v22, s47, 11 -; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 16 -; SI-NEXT: v_writelane_b32 v22, s46, 8 -; SI-NEXT: v_writelane_b32 v22, s47, 9 -; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 8 -; SI-NEXT: v_writelane_b32 v22, s46, 6 -; SI-NEXT: v_writelane_b32 v22, s47, 7 -; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 24 -; SI-NEXT: v_writelane_b32 v22, s46, 4 -; SI-NEXT: v_writelane_b32 v22, s47, 5 -; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 16 -; SI-NEXT: v_writelane_b32 v22, s46, 2 -; SI-NEXT: v_writelane_b32 v22, s47, 3 -; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 8 -; SI-NEXT: v_writelane_b32 v22, s46, 0 -; SI-NEXT: s_lshr_b32 s49, s19, 24 -; SI-NEXT: s_lshr_b32 s48, s19, 16 -; SI-NEXT: s_lshr_b32 s50, s19, 8 -; SI-NEXT: s_lshr_b32 s51, s17, 24 -; SI-NEXT: s_lshr_b32 s52, s17, 16 -; SI-NEXT: s_lshr_b32 s53, s17, 8 +; SI-NEXT: s_lshr_b32 s26, s5, 24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v23, s26, 40 +; SI-NEXT: s_lshr_b32 s26, s5, 16 +; SI-NEXT: v_writelane_b32 v23, s26, 41 +; SI-NEXT: s_lshr_b32 s26, s5, 8 +; SI-NEXT: v_writelane_b32 v23, s26, 42 +; SI-NEXT: s_lshr_b32 s26, s7, 24 +; SI-NEXT: v_writelane_b32 v23, s26, 43 +; SI-NEXT: s_lshr_b32 s26, s7, 16 +; SI-NEXT: v_writelane_b32 v23, s26, 44 +; SI-NEXT: s_lshr_b32 s26, s7, 8 +; SI-NEXT: v_writelane_b32 v23, s26, 45 +; SI-NEXT: s_lshr_b32 s26, s9, 24 +; SI-NEXT: v_writelane_b32 v23, s26, 46 +; SI-NEXT: s_lshr_b32 s26, s9, 16 +; SI-NEXT: v_writelane_b32 v23, s26, 47 +; SI-NEXT: s_lshr_b32 s26, s9, 8 +; SI-NEXT: v_writelane_b32 v23, s26, 48 +; SI-NEXT: s_lshr_b32 s26, s11, 24 +; SI-NEXT: v_writelane_b32 v23, s26, 49 +; SI-NEXT: s_lshr_b32 s26, s11, 16 +; SI-NEXT: v_writelane_b32 v23, s26, 50 +; SI-NEXT: s_lshr_b32 s26, s11, 8 +; SI-NEXT: v_writelane_b32 v23, s26, 51 +; SI-NEXT: s_lshr_b32 s26, s13, 24 +; SI-NEXT: v_writelane_b32 v23, s26, 52 +; SI-NEXT: s_lshr_b32 s26, s13, 16 +; SI-NEXT: v_writelane_b32 v23, s26, 53 +; SI-NEXT: s_lshr_b32 s26, s13, 8 +; SI-NEXT: v_writelane_b32 v23, s26, 54 +; SI-NEXT: s_lshr_b32 s26, s15, 24 +; SI-NEXT: v_writelane_b32 v23, s26, 55 +; SI-NEXT: s_lshr_b32 s26, s15, 16 +; SI-NEXT: v_writelane_b32 v23, s26, 56 +; SI-NEXT: s_lshr_b32 s26, s15, 8 +; SI-NEXT: v_writelane_b32 v23, s26, 57 +; SI-NEXT: s_lshr_b32 s26, s17, 24 +; SI-NEXT: v_writelane_b32 v23, s26, 58 +; SI-NEXT: s_lshr_b32 s26, s17, 16 +; SI-NEXT: v_writelane_b32 v23, s26, 59 +; SI-NEXT: s_lshr_b32 s26, s17, 8 +; SI-NEXT: v_writelane_b32 v23, s26, 60 +; SI-NEXT: s_lshr_b32 s26, s19, 24 +; SI-NEXT: v_writelane_b32 v23, s26, 61 +; SI-NEXT: s_lshr_b32 s26, s19, 16 +; SI-NEXT: v_writelane_b32 v23, s26, 62 +; SI-NEXT: s_lshr_b32 s26, s19, 8 +; SI-NEXT: v_writelane_b32 v23, s26, 63 +; SI-NEXT: s_lshr_b32 s26, s21, 24 +; SI-NEXT: v_writelane_b32 v22, s26, 0 +; SI-NEXT: s_lshr_b32 s26, s21, 16 +; SI-NEXT: v_writelane_b32 v22, s26, 1 +; SI-NEXT: s_lshr_b32 s26, s21, 8 +; SI-NEXT: v_writelane_b32 v22, s26, 2 +; SI-NEXT: s_lshr_b32 s26, s23, 24 +; SI-NEXT: v_writelane_b32 v22, s26, 3 +; SI-NEXT: s_lshr_b32 s26, s23, 16 +; SI-NEXT: v_writelane_b32 v22, s26, 4 +; SI-NEXT: s_lshr_b32 s26, s23, 8 +; SI-NEXT: v_writelane_b32 v22, s26, 5 +; SI-NEXT: s_lshr_b32 s26, s25, 24 +; SI-NEXT: v_writelane_b32 v22, s26, 6 +; SI-NEXT: s_lshr_b32 s26, s25, 16 +; SI-NEXT: v_writelane_b32 v22, s26, 7 +; SI-NEXT: s_lshr_b32 s26, s25, 8 +; SI-NEXT: v_writelane_b32 v22, s26, 8 +; SI-NEXT: s_lshr_b32 s26, s41, 24 +; SI-NEXT: v_writelane_b32 v22, s26, 9 +; SI-NEXT: s_lshr_b32 s26, s41, 16 +; SI-NEXT: v_writelane_b32 v22, s26, 10 +; SI-NEXT: s_lshr_b32 s26, s41, 8 +; SI-NEXT: v_writelane_b32 v22, s26, 11 +; SI-NEXT: s_lshr_b32 s26, s43, 24 +; SI-NEXT: v_writelane_b32 v22, s26, 12 +; SI-NEXT: s_lshr_b32 s26, s43, 16 +; SI-NEXT: v_writelane_b32 v22, s26, 13 +; SI-NEXT: s_lshr_b32 s26, s43, 8 +; SI-NEXT: v_writelane_b32 v22, s26, 14 +; SI-NEXT: s_lshr_b32 s26, s45, 24 +; SI-NEXT: v_writelane_b32 v22, s26, 15 +; SI-NEXT: s_lshr_b32 s26, s45, 16 +; SI-NEXT: v_writelane_b32 v22, s26, 16 +; SI-NEXT: s_lshr_b32 s26, s45, 8 +; SI-NEXT: v_writelane_b32 v22, s26, 17 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 24 +; SI-NEXT: v_writelane_b32 v23, s26, 38 +; SI-NEXT: v_writelane_b32 v23, s27, 39 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 8 +; SI-NEXT: v_writelane_b32 v23, s26, 36 +; SI-NEXT: v_writelane_b32 v23, s27, 37 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 24 +; SI-NEXT: v_writelane_b32 v23, s26, 34 +; SI-NEXT: v_writelane_b32 v23, s27, 35 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 16 +; SI-NEXT: v_writelane_b32 v23, s26, 32 +; SI-NEXT: v_writelane_b32 v23, s27, 33 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 8 +; SI-NEXT: v_writelane_b32 v23, s26, 30 +; SI-NEXT: v_writelane_b32 v23, s27, 31 +; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 24 +; SI-NEXT: v_writelane_b32 v23, s26, 28 +; SI-NEXT: v_writelane_b32 v23, s27, 29 +; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 16 +; SI-NEXT: v_writelane_b32 v23, s26, 26 +; SI-NEXT: v_writelane_b32 v23, s27, 27 +; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 8 +; SI-NEXT: v_writelane_b32 v23, s26, 24 +; SI-NEXT: v_writelane_b32 v23, s27, 25 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 24 +; SI-NEXT: v_writelane_b32 v23, s26, 22 +; SI-NEXT: v_writelane_b32 v23, s27, 23 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 16 +; SI-NEXT: v_writelane_b32 v23, s26, 20 +; SI-NEXT: v_writelane_b32 v23, s27, 21 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 8 +; SI-NEXT: v_writelane_b32 v23, s26, 18 +; SI-NEXT: v_writelane_b32 v23, s27, 19 +; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 24 +; SI-NEXT: v_writelane_b32 v23, s26, 16 +; SI-NEXT: v_writelane_b32 v23, s27, 17 +; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 16 +; SI-NEXT: v_writelane_b32 v23, s26, 14 +; SI-NEXT: v_writelane_b32 v23, s27, 15 +; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 8 +; SI-NEXT: v_writelane_b32 v23, s26, 12 +; SI-NEXT: v_writelane_b32 v23, s27, 13 +; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 24 +; SI-NEXT: v_writelane_b32 v23, s26, 10 +; SI-NEXT: v_writelane_b32 v23, s27, 11 +; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 16 +; SI-NEXT: v_writelane_b32 v23, s26, 8 +; SI-NEXT: v_writelane_b32 v23, s27, 9 +; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 8 +; SI-NEXT: v_writelane_b32 v23, s26, 6 +; SI-NEXT: v_writelane_b32 v23, s27, 7 +; SI-NEXT: s_lshr_b64 s[26:27], s[16:17], 24 +; SI-NEXT: v_writelane_b32 v23, s26, 4 +; SI-NEXT: v_writelane_b32 v23, s27, 5 +; SI-NEXT: s_lshr_b64 s[26:27], s[16:17], 16 +; SI-NEXT: v_writelane_b32 v23, s26, 2 +; SI-NEXT: v_writelane_b32 v23, s27, 3 +; SI-NEXT: s_lshr_b64 s[26:27], s[16:17], 8 +; SI-NEXT: v_writelane_b32 v23, s26, 0 +; SI-NEXT: s_lshr_b32 s49, s47, 24 +; SI-NEXT: s_lshr_b32 s48, s47, 16 +; SI-NEXT: s_lshr_b32 s50, s47, 8 +; SI-NEXT: s_lshr_b32 s51, s57, 24 +; SI-NEXT: s_lshr_b32 s52, s57, 16 +; SI-NEXT: s_lshr_b32 s53, s57, 8 ; SI-NEXT: s_lshr_b64 s[54:55], s[4:5], 16 -; SI-NEXT: v_writelane_b32 v22, s47, 1 -; SI-NEXT: s_lshr_b64 s[64:65], s[42:43], 24 -; SI-NEXT: s_lshr_b64 s[66:67], s[42:43], 16 -; SI-NEXT: s_lshr_b64 s[68:69], s[42:43], 8 -; SI-NEXT: s_lshr_b64 s[70:71], s[44:45], 24 -; SI-NEXT: s_lshr_b64 s[80:81], s[44:45], 16 -; SI-NEXT: s_lshr_b64 s[82:83], s[44:45], 8 -; SI-NEXT: s_lshr_b64 s[84:85], s[28:29], 24 -; SI-NEXT: s_lshr_b64 s[86:87], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[96:97], s[28:29], 8 -; SI-NEXT: s_lshr_b64 s[98:99], s[26:27], 24 -; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[56:57], s[26:27], 8 -; SI-NEXT: s_lshr_b64 s[58:59], s[24:25], 24 -; SI-NEXT: s_lshr_b64 s[60:61], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[62:63], s[24:25], 8 -; SI-NEXT: s_lshr_b64 s[72:73], s[22:23], 24 -; SI-NEXT: s_lshr_b64 s[74:75], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[22:23], 8 -; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 -; SI-NEXT: s_lshr_b64 s[88:89], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[90:91], s[20:21], 8 -; SI-NEXT: s_lshr_b64 s[92:93], s[18:19], 24 -; SI-NEXT: s_lshr_b64 s[94:95], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[30:31], s[18:19], 8 -; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 24 -; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 16 -; SI-NEXT: s_lshr_b64 s[38:39], s[16:17], 8 +; SI-NEXT: v_writelane_b32 v23, s27, 1 +; SI-NEXT: s_lshr_b64 s[64:65], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[66:67], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[68:69], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[70:71], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[80:81], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[82:83], s[20:21], 8 +; SI-NEXT: s_lshr_b64 s[84:85], s[22:23], 24 +; SI-NEXT: s_lshr_b64 s[86:87], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[96:97], s[22:23], 8 +; SI-NEXT: s_lshr_b64 s[98:99], s[24:25], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[24:25], 8 +; SI-NEXT: s_lshr_b64 s[58:59], s[40:41], 24 +; SI-NEXT: s_lshr_b64 s[60:61], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[40:41], 8 +; SI-NEXT: s_lshr_b64 s[72:73], s[42:43], 24 +; SI-NEXT: s_lshr_b64 s[74:75], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[42:43], 8 +; SI-NEXT: s_lshr_b64 s[78:79], s[44:45], 24 +; SI-NEXT: s_lshr_b64 s[88:89], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[44:45], 8 +; SI-NEXT: s_lshr_b64 s[92:93], s[46:47], 24 +; SI-NEXT: s_lshr_b64 s[94:95], s[46:47], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[46:47], 8 +; SI-NEXT: s_lshr_b64 s[34:35], s[56:57], 24 +; SI-NEXT: s_lshr_b64 s[36:37], s[56:57], 16 +; SI-NEXT: s_lshr_b64 s[38:39], s[56:57], 8 ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true ; SI-NEXT: s_add_u32 s4, s4, 3 @@ -82037,466 +82229,466 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: s_addc_u32 s13, s13, 0 ; SI-NEXT: s_add_u32 s14, s14, 3 ; SI-NEXT: s_addc_u32 s15, s15, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 ; SI-NEXT: s_add_u32 s40, s40, 3 ; SI-NEXT: s_addc_u32 s41, s41, 0 ; SI-NEXT: s_add_u32 s42, s42, 3 ; SI-NEXT: s_addc_u32 s43, s43, 0 ; SI-NEXT: s_add_u32 s44, s44, 3 ; SI-NEXT: s_addc_u32 s45, s45, 0 -; SI-NEXT: s_add_u32 s28, s28, 3 -; SI-NEXT: s_addc_u32 s29, s29, 0 -; SI-NEXT: s_add_u32 s26, s26, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 -; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s25, s25, 0 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: s_add_u32 s16, s16, 3 -; SI-NEXT: s_addc_u32 s17, s17, 0 -; SI-NEXT: s_lshr_b32 s46, s5, 24 -; SI-NEXT: v_writelane_b32 v22, s46, 40 -; SI-NEXT: s_lshr_b32 s46, s5, 16 -; SI-NEXT: v_writelane_b32 v22, s46, 41 -; SI-NEXT: s_lshr_b32 s46, s5, 8 -; SI-NEXT: v_writelane_b32 v22, s46, 42 -; SI-NEXT: s_lshr_b32 s46, s7, 24 -; SI-NEXT: v_writelane_b32 v22, s46, 43 -; SI-NEXT: s_lshr_b32 s46, s7, 16 -; SI-NEXT: v_writelane_b32 v22, s46, 44 -; SI-NEXT: s_lshr_b32 s46, s7, 8 -; SI-NEXT: v_writelane_b32 v22, s46, 45 -; SI-NEXT: s_lshr_b32 s46, s9, 24 -; SI-NEXT: v_writelane_b32 v22, s46, 46 -; SI-NEXT: s_lshr_b32 s46, s9, 16 -; SI-NEXT: v_writelane_b32 v22, s46, 47 -; SI-NEXT: s_lshr_b32 s46, s9, 8 -; SI-NEXT: v_writelane_b32 v22, s46, 48 -; SI-NEXT: s_lshr_b32 s46, s11, 24 -; SI-NEXT: v_writelane_b32 v22, s46, 49 -; SI-NEXT: s_lshr_b32 s46, s11, 16 -; SI-NEXT: v_writelane_b32 v22, s46, 50 -; SI-NEXT: s_lshr_b32 s46, s11, 8 -; SI-NEXT: v_writelane_b32 v22, s46, 51 -; SI-NEXT: s_lshr_b32 s46, s13, 24 -; SI-NEXT: v_writelane_b32 v22, s46, 52 -; SI-NEXT: s_lshr_b32 s46, s13, 16 -; SI-NEXT: v_writelane_b32 v22, s46, 53 -; SI-NEXT: s_lshr_b32 s46, s13, 8 -; SI-NEXT: v_writelane_b32 v22, s46, 54 -; SI-NEXT: s_lshr_b32 s46, s15, 24 -; SI-NEXT: v_writelane_b32 v22, s46, 55 -; SI-NEXT: s_lshr_b32 s46, s15, 16 -; SI-NEXT: v_writelane_b32 v22, s46, 56 -; SI-NEXT: s_lshr_b32 s46, s15, 8 -; SI-NEXT: v_writelane_b32 v22, s46, 57 -; SI-NEXT: s_lshr_b32 s46, s41, 24 -; SI-NEXT: v_writelane_b32 v22, s46, 58 -; SI-NEXT: s_lshr_b32 s46, s41, 16 -; SI-NEXT: v_writelane_b32 v22, s46, 59 -; SI-NEXT: s_lshr_b32 s46, s41, 8 -; SI-NEXT: v_writelane_b32 v22, s46, 60 -; SI-NEXT: s_lshr_b32 s46, s43, 24 -; SI-NEXT: v_writelane_b32 v22, s46, 61 -; SI-NEXT: s_lshr_b32 s46, s43, 16 -; SI-NEXT: v_writelane_b32 v22, s46, 62 -; SI-NEXT: s_lshr_b32 s46, s43, 8 -; SI-NEXT: v_writelane_b32 v22, s46, 63 -; SI-NEXT: s_lshr_b32 s46, s45, 24 -; SI-NEXT: v_writelane_b32 v21, s46, 0 -; SI-NEXT: s_lshr_b32 s46, s45, 16 -; SI-NEXT: v_writelane_b32 v21, s46, 1 -; SI-NEXT: s_lshr_b32 s46, s45, 8 -; SI-NEXT: v_writelane_b32 v21, s46, 2 -; SI-NEXT: s_lshr_b32 s46, s29, 24 -; SI-NEXT: v_writelane_b32 v21, s46, 3 -; SI-NEXT: s_lshr_b32 s46, s29, 16 -; SI-NEXT: v_writelane_b32 v21, s46, 4 -; SI-NEXT: s_lshr_b32 s46, s29, 8 -; SI-NEXT: v_writelane_b32 v21, s46, 5 -; SI-NEXT: s_lshr_b32 s46, s27, 24 -; SI-NEXT: v_writelane_b32 v21, s46, 6 -; SI-NEXT: s_lshr_b32 s46, s27, 16 -; SI-NEXT: v_writelane_b32 v21, s46, 7 -; SI-NEXT: s_lshr_b32 s46, s27, 8 -; SI-NEXT: v_writelane_b32 v21, s46, 8 -; SI-NEXT: s_lshr_b32 s46, s25, 24 -; SI-NEXT: v_writelane_b32 v21, s46, 9 -; SI-NEXT: s_lshr_b32 s46, s25, 16 -; SI-NEXT: v_writelane_b32 v21, s46, 10 -; SI-NEXT: s_lshr_b32 s46, s25, 8 -; SI-NEXT: v_writelane_b32 v21, s46, 11 -; SI-NEXT: s_lshr_b32 s46, s23, 24 -; SI-NEXT: v_writelane_b32 v21, s46, 12 -; SI-NEXT: s_lshr_b32 s46, s23, 16 -; SI-NEXT: v_writelane_b32 v21, s46, 13 -; SI-NEXT: s_lshr_b32 s46, s23, 8 -; SI-NEXT: v_writelane_b32 v21, s46, 14 -; SI-NEXT: s_lshr_b32 s46, s21, 24 -; SI-NEXT: v_writelane_b32 v21, s46, 15 -; SI-NEXT: s_lshr_b32 s46, s21, 16 -; SI-NEXT: v_writelane_b32 v21, s46, 16 -; SI-NEXT: s_lshr_b32 s46, s21, 8 -; SI-NEXT: v_writelane_b32 v21, s46, 17 -; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 -; SI-NEXT: v_writelane_b32 v22, s46, 38 -; SI-NEXT: v_writelane_b32 v22, s47, 39 -; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 8 -; SI-NEXT: v_writelane_b32 v22, s46, 36 -; SI-NEXT: v_writelane_b32 v22, s47, 37 -; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 24 -; SI-NEXT: v_writelane_b32 v22, s46, 34 -; SI-NEXT: v_writelane_b32 v22, s47, 35 -; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 16 -; SI-NEXT: v_writelane_b32 v22, s46, 32 -; SI-NEXT: v_writelane_b32 v22, s47, 33 -; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 8 -; SI-NEXT: v_writelane_b32 v22, s46, 30 -; SI-NEXT: v_writelane_b32 v22, s47, 31 -; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 24 -; SI-NEXT: v_writelane_b32 v22, s46, 28 -; SI-NEXT: v_writelane_b32 v22, s47, 29 -; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 16 -; SI-NEXT: v_writelane_b32 v22, s46, 26 -; SI-NEXT: v_writelane_b32 v22, s47, 27 -; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 8 -; SI-NEXT: v_writelane_b32 v22, s46, 24 -; SI-NEXT: v_writelane_b32 v22, s47, 25 -; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 24 -; SI-NEXT: v_writelane_b32 v22, s46, 22 -; SI-NEXT: v_writelane_b32 v22, s47, 23 -; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 16 -; SI-NEXT: v_writelane_b32 v22, s46, 20 -; SI-NEXT: v_writelane_b32 v22, s47, 21 -; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 8 -; SI-NEXT: v_writelane_b32 v22, s46, 18 -; SI-NEXT: v_writelane_b32 v22, s47, 19 -; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 24 -; SI-NEXT: v_writelane_b32 v22, s46, 16 -; SI-NEXT: v_writelane_b32 v22, s47, 17 -; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 16 -; SI-NEXT: v_writelane_b32 v22, s46, 14 -; SI-NEXT: v_writelane_b32 v22, s47, 15 -; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 8 -; SI-NEXT: v_writelane_b32 v22, s46, 12 -; SI-NEXT: v_writelane_b32 v22, s47, 13 -; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 24 -; SI-NEXT: v_writelane_b32 v22, s46, 10 -; SI-NEXT: v_writelane_b32 v22, s47, 11 -; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 16 -; SI-NEXT: v_writelane_b32 v22, s46, 8 -; SI-NEXT: v_writelane_b32 v22, s47, 9 -; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 8 -; SI-NEXT: v_writelane_b32 v22, s46, 6 -; SI-NEXT: v_writelane_b32 v22, s47, 7 -; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 24 -; SI-NEXT: v_writelane_b32 v22, s46, 4 -; SI-NEXT: v_writelane_b32 v22, s47, 5 -; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 16 -; SI-NEXT: v_writelane_b32 v22, s46, 2 -; SI-NEXT: v_writelane_b32 v22, s47, 3 -; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 8 -; SI-NEXT: v_writelane_b32 v22, s46, 0 -; SI-NEXT: s_lshr_b32 s49, s19, 24 -; SI-NEXT: s_lshr_b32 s48, s19, 16 -; SI-NEXT: s_lshr_b32 s50, s19, 8 -; SI-NEXT: s_lshr_b32 s51, s17, 24 -; SI-NEXT: s_lshr_b32 s52, s17, 16 -; SI-NEXT: s_lshr_b32 s53, s17, 8 +; SI-NEXT: s_add_u32 s46, s46, 3 +; SI-NEXT: s_addc_u32 s47, s47, 0 +; SI-NEXT: s_add_u32 s56, s56, 3 +; SI-NEXT: s_addc_u32 s57, s57, 0 +; SI-NEXT: s_lshr_b32 s26, s5, 24 +; SI-NEXT: v_writelane_b32 v23, s26, 40 +; SI-NEXT: s_lshr_b32 s26, s5, 16 +; SI-NEXT: v_writelane_b32 v23, s26, 41 +; SI-NEXT: s_lshr_b32 s26, s5, 8 +; SI-NEXT: v_writelane_b32 v23, s26, 42 +; SI-NEXT: s_lshr_b32 s26, s7, 24 +; SI-NEXT: v_writelane_b32 v23, s26, 43 +; SI-NEXT: s_lshr_b32 s26, s7, 16 +; SI-NEXT: v_writelane_b32 v23, s26, 44 +; SI-NEXT: s_lshr_b32 s26, s7, 8 +; SI-NEXT: v_writelane_b32 v23, s26, 45 +; SI-NEXT: s_lshr_b32 s26, s9, 24 +; SI-NEXT: v_writelane_b32 v23, s26, 46 +; SI-NEXT: s_lshr_b32 s26, s9, 16 +; SI-NEXT: v_writelane_b32 v23, s26, 47 +; SI-NEXT: s_lshr_b32 s26, s9, 8 +; SI-NEXT: v_writelane_b32 v23, s26, 48 +; SI-NEXT: s_lshr_b32 s26, s11, 24 +; SI-NEXT: v_writelane_b32 v23, s26, 49 +; SI-NEXT: s_lshr_b32 s26, s11, 16 +; SI-NEXT: v_writelane_b32 v23, s26, 50 +; SI-NEXT: s_lshr_b32 s26, s11, 8 +; SI-NEXT: v_writelane_b32 v23, s26, 51 +; SI-NEXT: s_lshr_b32 s26, s13, 24 +; SI-NEXT: v_writelane_b32 v23, s26, 52 +; SI-NEXT: s_lshr_b32 s26, s13, 16 +; SI-NEXT: v_writelane_b32 v23, s26, 53 +; SI-NEXT: s_lshr_b32 s26, s13, 8 +; SI-NEXT: v_writelane_b32 v23, s26, 54 +; SI-NEXT: s_lshr_b32 s26, s15, 24 +; SI-NEXT: v_writelane_b32 v23, s26, 55 +; SI-NEXT: s_lshr_b32 s26, s15, 16 +; SI-NEXT: v_writelane_b32 v23, s26, 56 +; SI-NEXT: s_lshr_b32 s26, s15, 8 +; SI-NEXT: v_writelane_b32 v23, s26, 57 +; SI-NEXT: s_lshr_b32 s26, s17, 24 +; SI-NEXT: v_writelane_b32 v23, s26, 58 +; SI-NEXT: s_lshr_b32 s26, s17, 16 +; SI-NEXT: v_writelane_b32 v23, s26, 59 +; SI-NEXT: s_lshr_b32 s26, s17, 8 +; SI-NEXT: v_writelane_b32 v23, s26, 60 +; SI-NEXT: s_lshr_b32 s26, s19, 24 +; SI-NEXT: v_writelane_b32 v23, s26, 61 +; SI-NEXT: s_lshr_b32 s26, s19, 16 +; SI-NEXT: v_writelane_b32 v23, s26, 62 +; SI-NEXT: s_lshr_b32 s26, s19, 8 +; SI-NEXT: v_writelane_b32 v23, s26, 63 +; SI-NEXT: s_lshr_b32 s26, s21, 24 +; SI-NEXT: v_writelane_b32 v22, s26, 0 +; SI-NEXT: s_lshr_b32 s26, s21, 16 +; SI-NEXT: v_writelane_b32 v22, s26, 1 +; SI-NEXT: s_lshr_b32 s26, s21, 8 +; SI-NEXT: v_writelane_b32 v22, s26, 2 +; SI-NEXT: s_lshr_b32 s26, s23, 24 +; SI-NEXT: v_writelane_b32 v22, s26, 3 +; SI-NEXT: s_lshr_b32 s26, s23, 16 +; SI-NEXT: v_writelane_b32 v22, s26, 4 +; SI-NEXT: s_lshr_b32 s26, s23, 8 +; SI-NEXT: v_writelane_b32 v22, s26, 5 +; SI-NEXT: s_lshr_b32 s26, s25, 24 +; SI-NEXT: v_writelane_b32 v22, s26, 6 +; SI-NEXT: s_lshr_b32 s26, s25, 16 +; SI-NEXT: v_writelane_b32 v22, s26, 7 +; SI-NEXT: s_lshr_b32 s26, s25, 8 +; SI-NEXT: v_writelane_b32 v22, s26, 8 +; SI-NEXT: s_lshr_b32 s26, s41, 24 +; SI-NEXT: v_writelane_b32 v22, s26, 9 +; SI-NEXT: s_lshr_b32 s26, s41, 16 +; SI-NEXT: v_writelane_b32 v22, s26, 10 +; SI-NEXT: s_lshr_b32 s26, s41, 8 +; SI-NEXT: v_writelane_b32 v22, s26, 11 +; SI-NEXT: s_lshr_b32 s26, s43, 24 +; SI-NEXT: v_writelane_b32 v22, s26, 12 +; SI-NEXT: s_lshr_b32 s26, s43, 16 +; SI-NEXT: v_writelane_b32 v22, s26, 13 +; SI-NEXT: s_lshr_b32 s26, s43, 8 +; SI-NEXT: v_writelane_b32 v22, s26, 14 +; SI-NEXT: s_lshr_b32 s26, s45, 24 +; SI-NEXT: v_writelane_b32 v22, s26, 15 +; SI-NEXT: s_lshr_b32 s26, s45, 16 +; SI-NEXT: v_writelane_b32 v22, s26, 16 +; SI-NEXT: s_lshr_b32 s26, s45, 8 +; SI-NEXT: v_writelane_b32 v22, s26, 17 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 24 +; SI-NEXT: v_writelane_b32 v23, s26, 38 +; SI-NEXT: v_writelane_b32 v23, s27, 39 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 8 +; SI-NEXT: v_writelane_b32 v23, s26, 36 +; SI-NEXT: v_writelane_b32 v23, s27, 37 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 24 +; SI-NEXT: v_writelane_b32 v23, s26, 34 +; SI-NEXT: v_writelane_b32 v23, s27, 35 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 16 +; SI-NEXT: v_writelane_b32 v23, s26, 32 +; SI-NEXT: v_writelane_b32 v23, s27, 33 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 8 +; SI-NEXT: v_writelane_b32 v23, s26, 30 +; SI-NEXT: v_writelane_b32 v23, s27, 31 +; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 24 +; SI-NEXT: v_writelane_b32 v23, s26, 28 +; SI-NEXT: v_writelane_b32 v23, s27, 29 +; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 16 +; SI-NEXT: v_writelane_b32 v23, s26, 26 +; SI-NEXT: v_writelane_b32 v23, s27, 27 +; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 8 +; SI-NEXT: v_writelane_b32 v23, s26, 24 +; SI-NEXT: v_writelane_b32 v23, s27, 25 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 24 +; SI-NEXT: v_writelane_b32 v23, s26, 22 +; SI-NEXT: v_writelane_b32 v23, s27, 23 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 16 +; SI-NEXT: v_writelane_b32 v23, s26, 20 +; SI-NEXT: v_writelane_b32 v23, s27, 21 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 8 +; SI-NEXT: v_writelane_b32 v23, s26, 18 +; SI-NEXT: v_writelane_b32 v23, s27, 19 +; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 24 +; SI-NEXT: v_writelane_b32 v23, s26, 16 +; SI-NEXT: v_writelane_b32 v23, s27, 17 +; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 16 +; SI-NEXT: v_writelane_b32 v23, s26, 14 +; SI-NEXT: v_writelane_b32 v23, s27, 15 +; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 8 +; SI-NEXT: v_writelane_b32 v23, s26, 12 +; SI-NEXT: v_writelane_b32 v23, s27, 13 +; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 24 +; SI-NEXT: v_writelane_b32 v23, s26, 10 +; SI-NEXT: v_writelane_b32 v23, s27, 11 +; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 16 +; SI-NEXT: v_writelane_b32 v23, s26, 8 +; SI-NEXT: v_writelane_b32 v23, s27, 9 +; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 8 +; SI-NEXT: v_writelane_b32 v23, s26, 6 +; SI-NEXT: v_writelane_b32 v23, s27, 7 +; SI-NEXT: s_lshr_b64 s[26:27], s[16:17], 24 +; SI-NEXT: v_writelane_b32 v23, s26, 4 +; SI-NEXT: v_writelane_b32 v23, s27, 5 +; SI-NEXT: s_lshr_b64 s[26:27], s[16:17], 16 +; SI-NEXT: v_writelane_b32 v23, s26, 2 +; SI-NEXT: v_writelane_b32 v23, s27, 3 +; SI-NEXT: s_lshr_b64 s[26:27], s[16:17], 8 +; SI-NEXT: v_writelane_b32 v23, s26, 0 +; SI-NEXT: s_lshr_b32 s49, s47, 24 +; SI-NEXT: s_lshr_b32 s48, s47, 16 +; SI-NEXT: s_lshr_b32 s50, s47, 8 +; SI-NEXT: s_lshr_b32 s51, s57, 24 +; SI-NEXT: s_lshr_b32 s52, s57, 16 +; SI-NEXT: s_lshr_b32 s53, s57, 8 ; SI-NEXT: s_lshr_b64 s[54:55], s[4:5], 16 -; SI-NEXT: v_writelane_b32 v22, s47, 1 -; SI-NEXT: s_lshr_b64 s[64:65], s[42:43], 24 -; SI-NEXT: s_lshr_b64 s[66:67], s[42:43], 16 -; SI-NEXT: s_lshr_b64 s[68:69], s[42:43], 8 -; SI-NEXT: s_lshr_b64 s[70:71], s[44:45], 24 -; SI-NEXT: s_lshr_b64 s[80:81], s[44:45], 16 -; SI-NEXT: s_lshr_b64 s[82:83], s[44:45], 8 -; SI-NEXT: s_lshr_b64 s[84:85], s[28:29], 24 -; SI-NEXT: s_lshr_b64 s[86:87], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[96:97], s[28:29], 8 -; SI-NEXT: s_lshr_b64 s[98:99], s[26:27], 24 -; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[56:57], s[26:27], 8 -; SI-NEXT: s_lshr_b64 s[58:59], s[24:25], 24 -; SI-NEXT: s_lshr_b64 s[60:61], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[62:63], s[24:25], 8 -; SI-NEXT: s_lshr_b64 s[72:73], s[22:23], 24 -; SI-NEXT: s_lshr_b64 s[74:75], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[22:23], 8 -; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 -; SI-NEXT: s_lshr_b64 s[88:89], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[90:91], s[20:21], 8 -; SI-NEXT: s_lshr_b64 s[92:93], s[18:19], 24 -; SI-NEXT: s_lshr_b64 s[94:95], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[30:31], s[18:19], 8 -; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 24 -; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 16 -; SI-NEXT: s_lshr_b64 s[38:39], s[16:17], 8 +; SI-NEXT: v_writelane_b32 v23, s27, 1 +; SI-NEXT: s_lshr_b64 s[64:65], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[66:67], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[68:69], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[70:71], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[80:81], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[82:83], s[20:21], 8 +; SI-NEXT: s_lshr_b64 s[84:85], s[22:23], 24 +; SI-NEXT: s_lshr_b64 s[86:87], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[96:97], s[22:23], 8 +; SI-NEXT: s_lshr_b64 s[98:99], s[24:25], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[24:25], 8 +; SI-NEXT: s_lshr_b64 s[58:59], s[40:41], 24 +; SI-NEXT: s_lshr_b64 s[60:61], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[40:41], 8 +; SI-NEXT: s_lshr_b64 s[72:73], s[42:43], 24 +; SI-NEXT: s_lshr_b64 s[74:75], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[42:43], 8 +; SI-NEXT: s_lshr_b64 s[78:79], s[44:45], 24 +; SI-NEXT: s_lshr_b64 s[88:89], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[44:45], 8 +; SI-NEXT: s_lshr_b64 s[92:93], s[46:47], 24 +; SI-NEXT: s_lshr_b64 s[94:95], s[46:47], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[46:47], 8 +; SI-NEXT: s_lshr_b64 s[34:35], s[56:57], 24 +; SI-NEXT: s_lshr_b64 s[36:37], s[56:57], 16 +; SI-NEXT: s_lshr_b64 s[38:39], s[56:57], 8 ; SI-NEXT: .LBB57_3: ; %end -; SI-NEXT: s_lshl_b32 s47, s38, 8 -; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_or_b32 s16, s16, s47 -; SI-NEXT: s_and_b32 s47, s36, 0xff -; SI-NEXT: s_lshl_b32 s57, s34, 24 -; SI-NEXT: s_lshl_b32 s47, s47, 16 -; SI-NEXT: s_or_b32 s47, s57, s47 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s16, s16, s47 -; SI-NEXT: v_mov_b32_e32 v1, s16 -; SI-NEXT: s_and_b32 s16, s17, 0xff -; SI-NEXT: s_lshl_b32 s17, s53, 8 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: s_and_b32 s17, s52, 0xff -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_lshl_b32 s47, s51, 24 -; SI-NEXT: s_or_b32 s17, s47, s17 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_lshl_b32 s16, s30, 8 -; SI-NEXT: s_and_b32 s17, s18, 0xff -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_and_b32 s17, s94, 0xff -; SI-NEXT: s_lshl_b32 s18, s92, 24 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_mov_b32_e32 v3, s16 -; SI-NEXT: s_and_b32 s16, s19, 0xff -; SI-NEXT: s_lshl_b32 s17, s50, 8 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: s_and_b32 s17, s48, 0xff -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_lshl_b32 s18, s49, 24 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_mov_b32_e32 v4, s16 -; SI-NEXT: s_lshl_b32 s16, s90, 8 -; SI-NEXT: s_and_b32 s17, s20, 0xff -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_and_b32 s17, s88, 0xff -; SI-NEXT: s_lshl_b32 s18, s78, 24 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_readlane_b32 s17, v21, 17 -; SI-NEXT: v_mov_b32_e32 v5, s16 -; SI-NEXT: s_and_b32 s16, s21, 0xff -; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_readlane_b32 s17, v21, 16 -; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: v_readlane_b32 s18, v21, 15 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_lshl_b32 s18, s18, 24 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_mov_b32_e32 v6, s16 -; SI-NEXT: s_lshl_b32 s16, s76, 8 -; SI-NEXT: s_and_b32 s17, s22, 0xff -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_and_b32 s17, s74, 0xff -; SI-NEXT: s_lshl_b32 s18, s72, 24 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_readlane_b32 s17, v21, 14 -; SI-NEXT: v_mov_b32_e32 v7, s16 -; SI-NEXT: s_and_b32 s16, s23, 0xff -; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_readlane_b32 s17, v21, 13 -; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: v_readlane_b32 s18, v21, 12 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_lshl_b32 s18, s18, 24 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_mov_b32_e32 v8, s16 -; SI-NEXT: s_lshl_b32 s16, s62, 8 -; SI-NEXT: s_and_b32 s17, s24, 0xff -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_and_b32 s17, s60, 0xff -; SI-NEXT: s_lshl_b32 s18, s58, 24 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_readlane_b32 s17, v21, 11 -; SI-NEXT: v_mov_b32_e32 v9, s16 -; SI-NEXT: s_and_b32 s16, s25, 0xff -; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_readlane_b32 s17, v21, 10 -; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: v_readlane_b32 s18, v21, 9 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_lshl_b32 s18, s18, 24 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_mov_b32_e32 v10, s16 -; SI-NEXT: s_lshl_b32 s16, s56, 8 -; SI-NEXT: s_and_b32 s17, s26, 0xff -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_and_b32 s17, s46, 0xff -; SI-NEXT: s_lshl_b32 s18, s98, 24 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_readlane_b32 s17, v21, 8 -; SI-NEXT: v_mov_b32_e32 v11, s16 -; SI-NEXT: s_and_b32 s16, s27, 0xff -; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_readlane_b32 s17, v21, 7 -; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: v_readlane_b32 s18, v21, 6 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_lshl_b32 s18, s18, 24 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_mov_b32_e32 v12, s16 -; SI-NEXT: s_lshl_b32 s16, s96, 8 -; SI-NEXT: s_and_b32 s17, s28, 0xff -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_and_b32 s17, s86, 0xff -; SI-NEXT: s_lshl_b32 s18, s84, 24 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_readlane_b32 s17, v21, 5 -; SI-NEXT: v_mov_b32_e32 v13, s16 -; SI-NEXT: s_and_b32 s16, s29, 0xff -; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_readlane_b32 s17, v21, 4 -; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: v_readlane_b32 s18, v21, 3 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_lshl_b32 s18, s18, 24 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_mov_b32_e32 v14, s16 -; SI-NEXT: s_lshl_b32 s16, s82, 8 -; SI-NEXT: s_and_b32 s17, s44, 0xff -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_and_b32 s17, s80, 0xff -; SI-NEXT: s_lshl_b32 s18, s70, 24 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_readlane_b32 s17, v21, 2 -; SI-NEXT: v_mov_b32_e32 v15, s16 -; SI-NEXT: s_and_b32 s16, s45, 0xff -; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_readlane_b32 s17, v21, 1 -; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: v_readlane_b32 s18, v21, 0 +; SI-NEXT: s_lshl_b32 s27, s38, 8 +; SI-NEXT: s_and_b32 s29, s56, 0xff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s29, s36, 0xff +; SI-NEXT: s_lshl_b32 s56, s34, 24 +; SI-NEXT: s_lshl_b32 s29, s29, 16 +; SI-NEXT: s_or_b32 s29, s56, s29 +; SI-NEXT: s_and_b32 s27, s27, 0xffff +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_mov_b32_e32 v1, s27 +; SI-NEXT: s_and_b32 s27, s57, 0xff +; SI-NEXT: s_lshl_b32 s29, s53, 8 +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: s_and_b32 s29, s52, 0xff +; SI-NEXT: s_lshl_b32 s29, s29, 16 +; SI-NEXT: s_lshl_b32 s56, s51, 24 +; SI-NEXT: s_or_b32 s29, s56, s29 +; SI-NEXT: s_and_b32 s27, s27, 0xffff +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_lshl_b32 s27, s30, 8 +; SI-NEXT: s_and_b32 s29, s46, 0xff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s29, s94, 0xff +; SI-NEXT: s_lshl_b32 s46, s92, 24 +; SI-NEXT: s_lshl_b32 s29, s29, 16 +; SI-NEXT: s_or_b32 s29, s46, s29 +; SI-NEXT: s_and_b32 s27, s27, 0xffff +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_mov_b32_e32 v3, s27 +; SI-NEXT: s_and_b32 s27, s47, 0xff +; SI-NEXT: s_lshl_b32 s29, s50, 8 +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: s_and_b32 s29, s48, 0xff +; SI-NEXT: s_lshl_b32 s29, s29, 16 +; SI-NEXT: s_lshl_b32 s46, s49, 24 +; SI-NEXT: s_or_b32 s29, s46, s29 +; SI-NEXT: s_and_b32 s27, s27, 0xffff +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_mov_b32_e32 v4, s27 +; SI-NEXT: s_lshl_b32 s27, s90, 8 +; SI-NEXT: s_and_b32 s29, s44, 0xff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s29, s88, 0xff +; SI-NEXT: s_lshl_b32 s44, s78, 24 +; SI-NEXT: s_lshl_b32 s29, s29, 16 +; SI-NEXT: s_or_b32 s29, s44, s29 +; SI-NEXT: s_and_b32 s27, s27, 0xffff +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_readlane_b32 s29, v22, 17 +; SI-NEXT: v_mov_b32_e32 v5, s27 +; SI-NEXT: s_and_b32 s27, s45, 0xff +; SI-NEXT: s_lshl_b32 s29, s29, 8 +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_readlane_b32 s29, v22, 16 +; SI-NEXT: s_and_b32 s29, s29, 0xff +; SI-NEXT: v_readlane_b32 s44, v22, 15 +; SI-NEXT: s_lshl_b32 s29, s29, 16 +; SI-NEXT: s_lshl_b32 s44, s44, 24 +; SI-NEXT: s_or_b32 s29, s44, s29 +; SI-NEXT: s_and_b32 s27, s27, 0xffff +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_mov_b32_e32 v6, s27 +; SI-NEXT: s_lshl_b32 s27, s76, 8 +; SI-NEXT: s_and_b32 s29, s42, 0xff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s29, s74, 0xff +; SI-NEXT: s_lshl_b32 s42, s72, 24 +; SI-NEXT: s_lshl_b32 s29, s29, 16 +; SI-NEXT: s_or_b32 s29, s42, s29 +; SI-NEXT: s_and_b32 s27, s27, 0xffff +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_readlane_b32 s29, v22, 14 +; SI-NEXT: v_mov_b32_e32 v7, s27 +; SI-NEXT: s_and_b32 s27, s43, 0xff +; SI-NEXT: s_lshl_b32 s29, s29, 8 +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_readlane_b32 s29, v22, 13 +; SI-NEXT: s_and_b32 s29, s29, 0xff +; SI-NEXT: v_readlane_b32 s42, v22, 12 +; SI-NEXT: s_lshl_b32 s29, s29, 16 +; SI-NEXT: s_lshl_b32 s42, s42, 24 +; SI-NEXT: s_or_b32 s29, s42, s29 +; SI-NEXT: s_and_b32 s27, s27, 0xffff +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_mov_b32_e32 v8, s27 +; SI-NEXT: s_lshl_b32 s27, s62, 8 +; SI-NEXT: s_and_b32 s29, s40, 0xff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s29, s60, 0xff +; SI-NEXT: s_lshl_b32 s40, s58, 24 +; SI-NEXT: s_lshl_b32 s29, s29, 16 +; SI-NEXT: s_or_b32 s29, s40, s29 +; SI-NEXT: s_and_b32 s27, s27, 0xffff +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_readlane_b32 s29, v22, 11 +; SI-NEXT: v_mov_b32_e32 v9, s27 +; SI-NEXT: s_and_b32 s27, s41, 0xff +; SI-NEXT: s_lshl_b32 s29, s29, 8 +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_readlane_b32 s29, v22, 10 +; SI-NEXT: s_and_b32 s29, s29, 0xff +; SI-NEXT: v_readlane_b32 s40, v22, 9 +; SI-NEXT: s_lshl_b32 s29, s29, 16 +; SI-NEXT: s_lshl_b32 s40, s40, 24 +; SI-NEXT: s_or_b32 s29, s40, s29 +; SI-NEXT: s_and_b32 s27, s27, 0xffff +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_mov_b32_e32 v10, s27 +; SI-NEXT: s_lshl_b32 s27, s28, 8 +; SI-NEXT: s_and_b32 s24, s24, 0xff +; SI-NEXT: s_and_b32 s26, s26, 0xff +; SI-NEXT: s_or_b32 s24, s24, s27 +; SI-NEXT: s_lshl_b32 s27, s98, 24 +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 s26, s27, s26 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_or_b32 s24, s24, s26 +; SI-NEXT: v_mov_b32_e32 v11, s24 +; SI-NEXT: s_and_b32 s24, s25, 0xff +; SI-NEXT: v_readlane_b32 s25, v22, 8 +; SI-NEXT: s_lshl_b32 s25, s25, 8 +; SI-NEXT: s_or_b32 s24, s24, s25 +; SI-NEXT: v_readlane_b32 s25, v22, 7 +; SI-NEXT: s_and_b32 s25, s25, 0xff +; SI-NEXT: v_readlane_b32 s26, v22, 6 +; SI-NEXT: s_lshl_b32 s25, s25, 16 +; SI-NEXT: s_lshl_b32 s26, s26, 24 +; SI-NEXT: s_or_b32 s25, s26, s25 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_or_b32 s24, s24, s25 +; SI-NEXT: v_mov_b32_e32 v12, s24 +; SI-NEXT: s_lshl_b32 s24, s96, 8 +; SI-NEXT: s_and_b32 s22, s22, 0xff +; SI-NEXT: s_or_b32 s22, s22, s24 +; SI-NEXT: s_and_b32 s24, s86, 0xff +; SI-NEXT: s_lshl_b32 s25, s84, 24 +; SI-NEXT: s_lshl_b32 s24, s24, 16 +; SI-NEXT: s_or_b32 s24, s25, s24 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_or_b32 s22, s22, s24 +; SI-NEXT: v_mov_b32_e32 v13, s22 +; SI-NEXT: s_and_b32 s22, s23, 0xff +; SI-NEXT: v_readlane_b32 s23, v22, 5 +; SI-NEXT: s_lshl_b32 s23, s23, 8 +; SI-NEXT: s_or_b32 s22, s22, s23 +; SI-NEXT: v_readlane_b32 s23, v22, 4 +; SI-NEXT: s_and_b32 s23, s23, 0xff +; SI-NEXT: v_readlane_b32 s24, v22, 3 +; SI-NEXT: s_lshl_b32 s23, s23, 16 +; SI-NEXT: s_lshl_b32 s24, s24, 24 +; SI-NEXT: s_or_b32 s23, s24, s23 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_or_b32 s22, s22, s23 +; SI-NEXT: v_mov_b32_e32 v14, s22 +; SI-NEXT: s_lshl_b32 s22, s82, 8 +; SI-NEXT: s_and_b32 s20, s20, 0xff +; SI-NEXT: s_or_b32 s20, s20, s22 +; SI-NEXT: s_and_b32 s22, s80, 0xff +; SI-NEXT: s_lshl_b32 s23, s70, 24 +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_or_b32 s22, s23, s22 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_or_b32 s20, s20, s22 +; SI-NEXT: v_mov_b32_e32 v15, s20 +; SI-NEXT: s_and_b32 s20, s21, 0xff +; SI-NEXT: v_readlane_b32 s21, v22, 2 +; SI-NEXT: s_lshl_b32 s21, s21, 8 +; SI-NEXT: s_or_b32 s20, s20, s21 +; SI-NEXT: v_readlane_b32 s21, v22, 1 +; SI-NEXT: s_and_b32 s21, s21, 0xff +; SI-NEXT: v_readlane_b32 s22, v22, 0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_lshl_b32 s22, s22, 24 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s21, s22, s21 +; SI-NEXT: s_and_b32 s20, s20, 0xffff ; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s20, s20, s21 ; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: v_mov_b32_e32 v16, s16 -; SI-NEXT: s_lshl_b32 s16, s68, 8 -; SI-NEXT: s_and_b32 s17, s42, 0xff +; SI-NEXT: v_mov_b32_e32 v16, s20 +; SI-NEXT: s_lshl_b32 s20, s68, 8 +; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_and_b32 s17, s66, 0xff +; SI-NEXT: s_or_b32 s18, s18, s20 +; SI-NEXT: s_and_b32 s20, s66, 0xff ; SI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_lshl_b32 s18, s64, 24 -; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s21, s64, 24 +; SI-NEXT: s_lshl_b32 s20, s20, 16 ; SI-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s20, s21, s20 +; SI-NEXT: s_and_b32 s18, s18, 0xffff ; SI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s18, s18, s20 ; SI-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: v_readlane_b32 s17, v22, 63 ; SI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s43, 0xff -; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_and_b32 s18, s19, 0xff +; SI-NEXT: v_readlane_b32 s19, v23, 63 ; SI-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_readlane_b32 s17, v22, 62 +; SI-NEXT: s_lshl_b32 s19, s19, 8 ; SI-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: v_readlane_b32 s18, v22, 61 +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: v_readlane_b32 s19, v23, 62 ; SI-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_and_b32 s19, s19, 0xff +; SI-NEXT: v_readlane_b32 s20, v23, 61 ; SI-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: v_readlane_b32 s18, v22, 0 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s20, 24 ; SI-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: v_readlane_b32 s19, v22, 1 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_or_b32 s19, s20, s19 ; SI-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: s_lshl_b32 s17, s18, 8 -; SI-NEXT: v_readlane_b32 s18, v22, 2 +; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s40, 0xff -; SI-NEXT: v_readlane_b32 s19, v22, 3 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: s_and_b32 s17, s18, 0xff -; SI-NEXT: v_readlane_b32 s18, v22, 4 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_readlane_b32 s18, v23, 0 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: v_readlane_b32 s19, v23, 1 +; SI-NEXT: s_lshl_b32 s18, s18, 8 +; SI-NEXT: s_or_b32 s16, s16, s18 +; SI-NEXT: v_readlane_b32 s18, v23, 2 +; SI-NEXT: v_readlane_b32 s19, v23, 3 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: v_readlane_b32 s20, v23, 4 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_lshl_b32 s19, s20, 24 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_or_b32 s18, s19, s18 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_readlane_b32 s17, v22, 60 +; SI-NEXT: s_or_b32 s16, s16, s18 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s41, 0xff +; SI-NEXT: s_and_b32 s16, s17, 0xff +; SI-NEXT: v_readlane_b32 s17, v23, 60 ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_readlane_b32 s17, v22, 59 +; SI-NEXT: v_readlane_b32 s17, v23, 59 ; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: v_readlane_b32 s18, v22, 58 +; SI-NEXT: v_readlane_b32 s18, v23, 58 ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_lshl_b32 s18, s18, 24 ; SI-NEXT: s_and_b32 s16, s16, 0xffff @@ -82506,16 +82698,15 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: v_readlane_b32 s16, v22, 6 +; SI-NEXT: v_readlane_b32 s16, v23, 6 ; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: v_readlane_b32 s17, v22, 7 +; SI-NEXT: v_readlane_b32 s17, v23, 7 ; SI-NEXT: s_lshl_b32 s16, s16, 8 -; SI-NEXT: v_readlane_b32 s19, v22, 5 ; SI-NEXT: s_or_b32 s14, s14, s16 -; SI-NEXT: v_readlane_b32 s16, v22, 8 -; SI-NEXT: v_readlane_b32 s17, v22, 9 +; SI-NEXT: v_readlane_b32 s16, v23, 8 +; SI-NEXT: v_readlane_b32 s17, v23, 9 ; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: v_readlane_b32 s18, v22, 10 +; SI-NEXT: v_readlane_b32 s18, v23, 10 ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_lshl_b32 s17, s18, 24 ; SI-NEXT: s_and_b32 s14, s14, 0xffff @@ -82526,12 +82717,12 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s14 ; SI-NEXT: s_and_b32 s14, s15, 0xff -; SI-NEXT: v_readlane_b32 s15, v22, 57 +; SI-NEXT: v_readlane_b32 s15, v23, 57 ; SI-NEXT: s_lshl_b32 s15, s15, 8 ; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: v_readlane_b32 s15, v22, 56 +; SI-NEXT: v_readlane_b32 s15, v23, 56 ; SI-NEXT: s_and_b32 s15, s15, 0xff -; SI-NEXT: v_readlane_b32 s16, v22, 55 +; SI-NEXT: v_readlane_b32 s16, v23, 55 ; SI-NEXT: s_lshl_b32 s15, s15, 16 ; SI-NEXT: s_lshl_b32 s16, s16, 24 ; SI-NEXT: s_and_b32 s14, s14, 0xffff @@ -82541,15 +82732,15 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s14 -; SI-NEXT: v_readlane_b32 s14, v22, 12 +; SI-NEXT: v_readlane_b32 s14, v23, 12 ; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: v_readlane_b32 s15, v22, 13 +; SI-NEXT: v_readlane_b32 s15, v23, 13 ; SI-NEXT: s_lshl_b32 s14, s14, 8 ; SI-NEXT: s_or_b32 s12, s12, s14 -; SI-NEXT: v_readlane_b32 s14, v22, 14 -; SI-NEXT: v_readlane_b32 s15, v22, 15 +; SI-NEXT: v_readlane_b32 s14, v23, 14 +; SI-NEXT: v_readlane_b32 s15, v23, 15 ; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: v_readlane_b32 s16, v22, 16 +; SI-NEXT: v_readlane_b32 s16, v23, 16 ; SI-NEXT: s_lshl_b32 s14, s14, 16 ; SI-NEXT: s_lshl_b32 s15, s16, 24 ; SI-NEXT: s_and_b32 s12, s12, 0xffff @@ -82560,12 +82751,12 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s12 ; SI-NEXT: s_and_b32 s12, s13, 0xff -; SI-NEXT: v_readlane_b32 s13, v22, 54 +; SI-NEXT: v_readlane_b32 s13, v23, 54 ; SI-NEXT: s_lshl_b32 s13, s13, 8 ; SI-NEXT: s_or_b32 s12, s12, s13 -; SI-NEXT: v_readlane_b32 s13, v22, 53 +; SI-NEXT: v_readlane_b32 s13, v23, 53 ; SI-NEXT: s_and_b32 s13, s13, 0xff -; SI-NEXT: v_readlane_b32 s14, v22, 52 +; SI-NEXT: v_readlane_b32 s14, v23, 52 ; SI-NEXT: s_lshl_b32 s13, s13, 16 ; SI-NEXT: s_lshl_b32 s14, s14, 24 ; SI-NEXT: s_and_b32 s12, s12, 0xffff @@ -82575,15 +82766,15 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s12 -; SI-NEXT: v_readlane_b32 s12, v22, 18 +; SI-NEXT: v_readlane_b32 s12, v23, 18 ; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: v_readlane_b32 s13, v22, 19 +; SI-NEXT: v_readlane_b32 s13, v23, 19 ; SI-NEXT: s_lshl_b32 s12, s12, 8 ; SI-NEXT: s_or_b32 s10, s10, s12 -; SI-NEXT: v_readlane_b32 s12, v22, 20 -; SI-NEXT: v_readlane_b32 s13, v22, 21 +; SI-NEXT: v_readlane_b32 s12, v23, 20 +; SI-NEXT: v_readlane_b32 s13, v23, 21 ; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: v_readlane_b32 s14, v22, 22 +; SI-NEXT: v_readlane_b32 s14, v23, 22 ; SI-NEXT: s_lshl_b32 s12, s12, 16 ; SI-NEXT: s_lshl_b32 s13, s14, 24 ; SI-NEXT: s_and_b32 s10, s10, 0xffff @@ -82594,12 +82785,12 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: s_and_b32 s10, s11, 0xff -; SI-NEXT: v_readlane_b32 s11, v22, 51 +; SI-NEXT: v_readlane_b32 s11, v23, 51 ; SI-NEXT: s_lshl_b32 s11, s11, 8 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_readlane_b32 s11, v22, 50 +; SI-NEXT: v_readlane_b32 s11, v23, 50 ; SI-NEXT: s_and_b32 s11, s11, 0xff -; SI-NEXT: v_readlane_b32 s12, v22, 49 +; SI-NEXT: v_readlane_b32 s12, v23, 49 ; SI-NEXT: s_lshl_b32 s11, s11, 16 ; SI-NEXT: s_lshl_b32 s12, s12, 24 ; SI-NEXT: s_and_b32 s10, s10, 0xffff @@ -82609,15 +82800,15 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: v_readlane_b32 s10, v22, 24 +; SI-NEXT: v_readlane_b32 s10, v23, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: v_readlane_b32 s11, v22, 25 +; SI-NEXT: v_readlane_b32 s11, v23, 25 ; SI-NEXT: s_lshl_b32 s10, s10, 8 ; SI-NEXT: s_or_b32 s8, s8, s10 -; SI-NEXT: v_readlane_b32 s10, v22, 26 -; SI-NEXT: v_readlane_b32 s11, v22, 27 +; SI-NEXT: v_readlane_b32 s10, v23, 26 +; SI-NEXT: v_readlane_b32 s11, v23, 27 ; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: v_readlane_b32 s12, v22, 28 +; SI-NEXT: v_readlane_b32 s12, v23, 28 ; SI-NEXT: s_lshl_b32 s10, s10, 16 ; SI-NEXT: s_lshl_b32 s11, s12, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xffff @@ -82628,12 +82819,12 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: s_and_b32 s8, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v22, 48 +; SI-NEXT: v_readlane_b32 s9, v23, 48 ; SI-NEXT: s_lshl_b32 s9, s9, 8 ; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: v_readlane_b32 s9, v22, 47 +; SI-NEXT: v_readlane_b32 s9, v23, 47 ; SI-NEXT: s_and_b32 s9, s9, 0xff -; SI-NEXT: v_readlane_b32 s10, v22, 46 +; SI-NEXT: v_readlane_b32 s10, v23, 46 ; SI-NEXT: s_lshl_b32 s9, s9, 16 ; SI-NEXT: s_lshl_b32 s10, s10, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xffff @@ -82643,15 +82834,15 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: v_readlane_b32 s8, v22, 30 +; SI-NEXT: v_readlane_b32 s8, v23, 30 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: v_readlane_b32 s9, v22, 31 +; SI-NEXT: v_readlane_b32 s9, v23, 31 ; SI-NEXT: s_lshl_b32 s8, s8, 8 ; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: v_readlane_b32 s8, v22, 32 -; SI-NEXT: v_readlane_b32 s9, v22, 33 +; SI-NEXT: v_readlane_b32 s8, v23, 32 +; SI-NEXT: v_readlane_b32 s9, v23, 33 ; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: v_readlane_b32 s10, v22, 34 +; SI-NEXT: v_readlane_b32 s10, v23, 34 ; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_lshl_b32 s9, s10, 24 ; SI-NEXT: s_and_b32 s6, s6, 0xffff @@ -82662,12 +82853,12 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_and_b32 s6, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v22, 45 +; SI-NEXT: v_readlane_b32 s7, v23, 45 ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: v_readlane_b32 s7, v22, 44 +; SI-NEXT: v_readlane_b32 s7, v23, 44 ; SI-NEXT: s_and_b32 s7, s7, 0xff -; SI-NEXT: v_readlane_b32 s8, v22, 43 +; SI-NEXT: v_readlane_b32 s8, v23, 43 ; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_lshl_b32 s8, s8, 24 ; SI-NEXT: s_and_b32 s6, s6, 0xffff @@ -82677,13 +82868,13 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_readlane_b32 s6, v22, 36 +; SI-NEXT: v_readlane_b32 s6, v23, 36 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 8 -; SI-NEXT: v_readlane_b32 s7, v22, 37 +; SI-NEXT: v_readlane_b32 s7, v23, 37 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: s_and_b32 s6, s54, 0xff -; SI-NEXT: v_readlane_b32 s8, v22, 38 +; SI-NEXT: v_readlane_b32 s8, v23, 38 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_lshl_b32 s7, s8, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff @@ -82694,12 +82885,12 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s4 ; SI-NEXT: s_and_b32 s4, s5, 0xff -; SI-NEXT: v_readlane_b32 s5, v22, 42 +; SI-NEXT: v_readlane_b32 s5, v23, 42 ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s5, v22, 41 +; SI-NEXT: v_readlane_b32 s5, v23, 41 ; SI-NEXT: s_and_b32 s5, s5, 0xff -; SI-NEXT: v_readlane_b32 s6, v22, 40 +; SI-NEXT: v_readlane_b32 s6, v23, 40 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s6, s6, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff @@ -82709,69 +82900,70 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: v_readlane_b32 s19, v22, 11 -; SI-NEXT: v_readlane_b32 s17, v22, 17 -; SI-NEXT: v_readlane_b32 s15, v22, 23 -; SI-NEXT: v_readlane_b32 s13, v22, 29 -; SI-NEXT: v_readlane_b32 s11, v22, 35 -; SI-NEXT: v_readlane_b32 s9, v22, 39 +; SI-NEXT: v_readlane_b32 s21, v23, 5 +; SI-NEXT: v_readlane_b32 s19, v23, 11 +; SI-NEXT: v_readlane_b32 s17, v23, 17 +; SI-NEXT: v_readlane_b32 s15, v23, 23 +; SI-NEXT: v_readlane_b32 s13, v23, 29 +; SI-NEXT: v_readlane_b32 s11, v23, 35 +; SI-NEXT: v_readlane_b32 s9, v23, 39 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: v_readlane_b32 s99, v20, 35 -; SI-NEXT: v_readlane_b32 s98, v20, 34 -; SI-NEXT: v_readlane_b32 s97, v20, 33 -; SI-NEXT: v_readlane_b32 s96, v20, 32 -; SI-NEXT: v_readlane_b32 s87, v20, 31 -; SI-NEXT: v_readlane_b32 s86, v20, 30 -; SI-NEXT: v_readlane_b32 s85, v20, 29 -; SI-NEXT: v_readlane_b32 s84, v20, 28 -; SI-NEXT: v_readlane_b32 s83, v20, 27 -; SI-NEXT: v_readlane_b32 s82, v20, 26 -; SI-NEXT: v_readlane_b32 s81, v20, 25 -; SI-NEXT: v_readlane_b32 s80, v20, 24 -; SI-NEXT: v_readlane_b32 s71, v20, 23 -; SI-NEXT: v_readlane_b32 s70, v20, 22 -; SI-NEXT: v_readlane_b32 s69, v20, 21 -; SI-NEXT: v_readlane_b32 s68, v20, 20 -; SI-NEXT: v_readlane_b32 s67, v20, 19 -; SI-NEXT: v_readlane_b32 s66, v20, 18 -; SI-NEXT: v_readlane_b32 s65, v20, 17 -; SI-NEXT: v_readlane_b32 s64, v20, 16 -; SI-NEXT: v_readlane_b32 s55, v20, 15 -; SI-NEXT: v_readlane_b32 s54, v20, 14 -; SI-NEXT: v_readlane_b32 s53, v20, 13 -; SI-NEXT: v_readlane_b32 s52, v20, 12 -; SI-NEXT: v_readlane_b32 s51, v20, 11 -; SI-NEXT: v_readlane_b32 s50, v20, 10 -; SI-NEXT: v_readlane_b32 s49, v20, 9 -; SI-NEXT: v_readlane_b32 s48, v20, 8 -; SI-NEXT: v_readlane_b32 s39, v20, 7 -; SI-NEXT: v_readlane_b32 s38, v20, 6 -; SI-NEXT: v_readlane_b32 s37, v20, 5 -; SI-NEXT: v_readlane_b32 s36, v20, 4 -; SI-NEXT: v_readlane_b32 s35, v20, 3 -; SI-NEXT: v_readlane_b32 s34, v20, 2 -; SI-NEXT: v_readlane_b32 s31, v20, 1 -; SI-NEXT: v_readlane_b32 s30, v20, 0 +; SI-NEXT: v_readlane_b32 s99, v21, 35 +; SI-NEXT: v_readlane_b32 s98, v21, 34 +; SI-NEXT: v_readlane_b32 s97, v21, 33 +; SI-NEXT: v_readlane_b32 s96, v21, 32 +; SI-NEXT: v_readlane_b32 s87, v21, 31 +; SI-NEXT: v_readlane_b32 s86, v21, 30 +; SI-NEXT: v_readlane_b32 s85, v21, 29 +; SI-NEXT: v_readlane_b32 s84, v21, 28 +; SI-NEXT: v_readlane_b32 s83, v21, 27 +; SI-NEXT: v_readlane_b32 s82, v21, 26 +; SI-NEXT: v_readlane_b32 s81, v21, 25 +; SI-NEXT: v_readlane_b32 s80, v21, 24 +; SI-NEXT: v_readlane_b32 s71, v21, 23 +; SI-NEXT: v_readlane_b32 s70, v21, 22 +; SI-NEXT: v_readlane_b32 s69, v21, 21 +; SI-NEXT: v_readlane_b32 s68, v21, 20 +; SI-NEXT: v_readlane_b32 s67, v21, 19 +; SI-NEXT: v_readlane_b32 s66, v21, 18 +; SI-NEXT: v_readlane_b32 s65, v21, 17 +; SI-NEXT: v_readlane_b32 s64, v21, 16 +; SI-NEXT: v_readlane_b32 s55, v21, 15 +; SI-NEXT: v_readlane_b32 s54, v21, 14 +; SI-NEXT: v_readlane_b32 s53, v21, 13 +; SI-NEXT: v_readlane_b32 s52, v21, 12 +; SI-NEXT: v_readlane_b32 s51, v21, 11 +; SI-NEXT: v_readlane_b32 s50, v21, 10 +; SI-NEXT: v_readlane_b32 s49, v21, 9 +; SI-NEXT: v_readlane_b32 s48, v21, 8 +; SI-NEXT: v_readlane_b32 s39, v21, 7 +; SI-NEXT: v_readlane_b32 s38, v21, 6 +; SI-NEXT: v_readlane_b32 s37, v21, 5 +; SI-NEXT: v_readlane_b32 s36, v21, 4 +; SI-NEXT: v_readlane_b32 s35, v21, 3 +; SI-NEXT: v_readlane_b32 s34, v21, 2 +; SI-NEXT: v_readlane_b32 s31, v21, 1 +; SI-NEXT: v_readlane_b32 s30, v21, 0 ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v22, s54, 0 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v22, s55, 1 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v23, s54, 0 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s55, 1 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; implicit-def: $sgpr53 ; SI-NEXT: ; implicit-def: $sgpr52 @@ -82794,7 +82986,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr62 ; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr98 ; SI-NEXT: ; implicit-def: $sgpr96 ; SI-NEXT: ; implicit-def: $sgpr86 @@ -82805,139 +82997,139 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr68 ; SI-NEXT: ; implicit-def: $sgpr66 ; SI-NEXT: ; implicit-def: $sgpr64 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v22, s54, 2 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v22, s55, 3 -; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s54, 2 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s55, 3 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v22, s54, 4 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v22, s55, 5 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s54, 4 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s55, 5 ; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v22, s54, 6 -; SI-NEXT: v_writelane_b32 v22, s55, 7 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s54, 6 +; SI-NEXT: v_writelane_b32 v23, s55, 7 ; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v22, s54, 8 -; SI-NEXT: v_writelane_b32 v22, s55, 9 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s54, 8 +; SI-NEXT: v_writelane_b32 v23, s55, 9 ; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v22, s54, 10 -; SI-NEXT: v_writelane_b32 v22, s55, 11 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s54, 10 +; SI-NEXT: v_writelane_b32 v23, s55, 11 ; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v22, s54, 12 -; SI-NEXT: v_writelane_b32 v22, s55, 13 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s54, 12 +; SI-NEXT: v_writelane_b32 v23, s55, 13 ; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v22, s54, 14 -; SI-NEXT: v_writelane_b32 v22, s55, 15 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s54, 14 +; SI-NEXT: v_writelane_b32 v23, s55, 15 ; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v22, s54, 16 -; SI-NEXT: v_writelane_b32 v22, s55, 17 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s54, 16 +; SI-NEXT: v_writelane_b32 v23, s55, 17 ; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v22, s54, 18 -; SI-NEXT: v_writelane_b32 v22, s55, 19 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s54, 18 +; SI-NEXT: v_writelane_b32 v23, s55, 19 ; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v22, s54, 20 -; SI-NEXT: v_writelane_b32 v22, s55, 21 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s54, 20 +; SI-NEXT: v_writelane_b32 v23, s55, 21 ; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v22, s54, 22 -; SI-NEXT: v_writelane_b32 v22, s55, 23 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s54, 22 +; SI-NEXT: v_writelane_b32 v23, s55, 23 ; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v22, s54, 24 -; SI-NEXT: v_writelane_b32 v22, s55, 25 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s54, 24 +; SI-NEXT: v_writelane_b32 v23, s55, 25 ; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v22, s54, 26 -; SI-NEXT: v_writelane_b32 v22, s55, 27 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s54, 26 +; SI-NEXT: v_writelane_b32 v23, s55, 27 ; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v22, s54, 28 -; SI-NEXT: v_writelane_b32 v22, s55, 29 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s54, 28 +; SI-NEXT: v_writelane_b32 v23, s55, 29 ; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v22, s54, 30 -; SI-NEXT: v_writelane_b32 v22, s55, 31 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s54, 30 +; SI-NEXT: v_writelane_b32 v23, s55, 31 ; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v22, s54, 32 -; SI-NEXT: v_writelane_b32 v22, s55, 33 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s54, 32 +; SI-NEXT: v_writelane_b32 v23, s55, 33 ; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v22, s54, 34 -; SI-NEXT: v_writelane_b32 v22, s55, 35 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s54, 34 +; SI-NEXT: v_writelane_b32 v23, s55, 35 ; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v22, s54, 36 -; SI-NEXT: v_writelane_b32 v22, s55, 37 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s54, 36 +; SI-NEXT: v_writelane_b32 v23, s55, 37 ; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v22, s54, 38 -; SI-NEXT: v_writelane_b32 v22, s55, 39 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s54, 38 +; SI-NEXT: v_writelane_b32 v23, s55, 39 ; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: s_branch .LBB57_2 ; @@ -82945,47 +83137,75 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v20, s30, 0 -; VI-NEXT: v_writelane_b32 v20, s31, 1 -; VI-NEXT: v_writelane_b32 v20, s34, 2 -; VI-NEXT: v_writelane_b32 v20, s35, 3 -; VI-NEXT: v_writelane_b32 v20, s36, 4 -; VI-NEXT: v_writelane_b32 v20, s37, 5 -; VI-NEXT: v_writelane_b32 v20, s38, 6 -; VI-NEXT: v_writelane_b32 v20, s39, 7 -; VI-NEXT: v_writelane_b32 v20, s48, 8 -; VI-NEXT: v_writelane_b32 v20, s49, 9 -; VI-NEXT: v_writelane_b32 v20, s50, 10 -; VI-NEXT: v_writelane_b32 v20, s51, 11 -; VI-NEXT: v_writelane_b32 v20, s52, 12 -; VI-NEXT: v_writelane_b32 v20, s53, 13 -; VI-NEXT: v_writelane_b32 v20, s54, 14 -; VI-NEXT: v_writelane_b32 v20, s55, 15 -; VI-NEXT: v_writelane_b32 v20, s64, 16 -; VI-NEXT: v_writelane_b32 v20, s65, 17 -; VI-NEXT: v_writelane_b32 v20, s66, 18 -; VI-NEXT: v_writelane_b32 v20, s67, 19 -; VI-NEXT: v_writelane_b32 v20, s68, 20 -; VI-NEXT: v_writelane_b32 v20, s69, 21 -; VI-NEXT: v_writelane_b32 v20, s70, 22 -; VI-NEXT: v_writelane_b32 v20, s71, 23 -; VI-NEXT: v_writelane_b32 v20, s80, 24 -; VI-NEXT: v_writelane_b32 v20, s81, 25 -; VI-NEXT: v_writelane_b32 v20, s82, 26 -; VI-NEXT: v_writelane_b32 v20, s83, 27 -; VI-NEXT: v_writelane_b32 v20, s84, 28 -; VI-NEXT: v_writelane_b32 v20, s85, 29 +; VI-NEXT: v_writelane_b32 v21, s30, 0 +; VI-NEXT: v_writelane_b32 v21, s31, 1 +; VI-NEXT: v_writelane_b32 v21, s34, 2 +; VI-NEXT: v_writelane_b32 v21, s35, 3 +; VI-NEXT: v_writelane_b32 v21, s36, 4 +; VI-NEXT: v_writelane_b32 v21, s37, 5 +; VI-NEXT: v_writelane_b32 v21, s38, 6 +; VI-NEXT: v_writelane_b32 v21, s39, 7 +; VI-NEXT: v_writelane_b32 v21, s48, 8 +; VI-NEXT: v_writelane_b32 v21, s49, 9 +; VI-NEXT: v_writelane_b32 v21, s50, 10 +; VI-NEXT: v_writelane_b32 v21, s51, 11 +; VI-NEXT: v_writelane_b32 v21, s52, 12 +; VI-NEXT: v_writelane_b32 v21, s53, 13 +; VI-NEXT: v_writelane_b32 v21, s54, 14 +; VI-NEXT: v_writelane_b32 v21, s55, 15 +; VI-NEXT: v_writelane_b32 v21, s64, 16 +; VI-NEXT: v_mov_b32_e32 v20, s16 +; VI-NEXT: v_writelane_b32 v21, s65, 17 +; VI-NEXT: v_readfirstlane_b32 s56, v20 +; VI-NEXT: v_mov_b32_e32 v20, s17 +; VI-NEXT: v_writelane_b32 v21, s66, 18 +; VI-NEXT: v_readfirstlane_b32 s57, v20 +; VI-NEXT: v_mov_b32_e32 v20, s18 +; VI-NEXT: v_writelane_b32 v21, s67, 19 +; VI-NEXT: v_readfirstlane_b32 s46, v20 +; VI-NEXT: v_mov_b32_e32 v20, s19 +; VI-NEXT: v_writelane_b32 v21, s68, 20 +; VI-NEXT: v_readfirstlane_b32 s47, v20 +; VI-NEXT: v_mov_b32_e32 v20, s20 +; VI-NEXT: v_writelane_b32 v21, s69, 21 +; VI-NEXT: v_readfirstlane_b32 s44, v20 +; VI-NEXT: v_mov_b32_e32 v20, s21 +; VI-NEXT: v_writelane_b32 v21, s70, 22 +; VI-NEXT: v_readfirstlane_b32 s45, v20 +; VI-NEXT: v_mov_b32_e32 v20, s22 +; VI-NEXT: v_writelane_b32 v21, s71, 23 +; VI-NEXT: v_readfirstlane_b32 s42, v20 +; VI-NEXT: v_mov_b32_e32 v20, s23 +; VI-NEXT: v_writelane_b32 v21, s80, 24 +; VI-NEXT: v_readfirstlane_b32 s43, v20 +; VI-NEXT: v_mov_b32_e32 v20, s24 +; VI-NEXT: v_writelane_b32 v21, s81, 25 +; VI-NEXT: v_readfirstlane_b32 s40, v20 +; VI-NEXT: v_mov_b32_e32 v20, s25 +; VI-NEXT: v_writelane_b32 v21, s82, 26 +; VI-NEXT: v_readfirstlane_b32 s41, v20 +; VI-NEXT: v_mov_b32_e32 v20, s26 +; VI-NEXT: v_writelane_b32 v21, s83, 27 +; VI-NEXT: v_readfirstlane_b32 s24, v20 +; VI-NEXT: v_mov_b32_e32 v20, s27 +; VI-NEXT: v_writelane_b32 v21, s84, 28 +; VI-NEXT: v_readfirstlane_b32 s25, v20 +; VI-NEXT: v_mov_b32_e32 v20, s28 +; VI-NEXT: v_writelane_b32 v21, s85, 29 +; VI-NEXT: v_readfirstlane_b32 s22, v20 +; VI-NEXT: v_mov_b32_e32 v20, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; VI-NEXT: v_writelane_b32 v20, s86, 30 -; VI-NEXT: v_readfirstlane_b32 s44, v1 -; VI-NEXT: v_readfirstlane_b32 s45, v2 -; VI-NEXT: v_readfirstlane_b32 s42, v3 -; VI-NEXT: v_readfirstlane_b32 s43, v4 -; VI-NEXT: v_readfirstlane_b32 s40, v5 -; VI-NEXT: v_readfirstlane_b32 s41, v6 +; VI-NEXT: v_writelane_b32 v21, s86, 30 +; VI-NEXT: v_readfirstlane_b32 s23, v20 +; VI-NEXT: v_readfirstlane_b32 s20, v1 +; VI-NEXT: v_readfirstlane_b32 s21, v2 +; VI-NEXT: v_readfirstlane_b32 s18, v3 +; VI-NEXT: v_readfirstlane_b32 s19, v4 +; VI-NEXT: v_readfirstlane_b32 s16, v5 +; VI-NEXT: v_readfirstlane_b32 s17, v6 ; VI-NEXT: v_readfirstlane_b32 s14, v7 ; VI-NEXT: v_readfirstlane_b32 s15, v8 ; VI-NEXT: v_readfirstlane_b32 s12, v9 @@ -82997,190 +83217,190 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; VI-NEXT: v_readfirstlane_b32 s6, v15 ; VI-NEXT: v_readfirstlane_b32 s7, v16 ; VI-NEXT: v_readfirstlane_b32 s4, v17 -; VI-NEXT: s_and_b64 s[46:47], vcc, exec +; VI-NEXT: s_and_b64 s[26:27], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s5, v18 -; VI-NEXT: v_writelane_b32 v20, s87, 31 -; VI-NEXT: ; implicit-def: $vgpr21 : SGPR spill to VGPR lane +; VI-NEXT: v_writelane_b32 v21, s87, 31 +; VI-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane ; VI-NEXT: s_cbranch_scc0 .LBB57_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s46, s5, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 8 -; VI-NEXT: s_lshr_b32 s46, s5, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 9 -; VI-NEXT: s_lshr_b32 s46, s5, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 10 -; VI-NEXT: s_lshr_b32 s46, s4, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 11 -; VI-NEXT: s_lshr_b32 s46, s4, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 12 -; VI-NEXT: s_lshr_b32 s46, s7, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 13 -; VI-NEXT: s_lshr_b32 s46, s7, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 14 -; VI-NEXT: s_lshr_b32 s46, s7, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 15 -; VI-NEXT: s_lshr_b32 s46, s6, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 16 -; VI-NEXT: s_lshr_b32 s46, s6, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 17 -; VI-NEXT: s_lshr_b32 s46, s9, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 18 -; VI-NEXT: s_lshr_b32 s46, s9, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 19 -; VI-NEXT: s_lshr_b32 s46, s9, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 20 -; VI-NEXT: s_lshr_b32 s46, s8, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 21 -; VI-NEXT: s_lshr_b32 s46, s8, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 22 -; VI-NEXT: s_lshr_b32 s46, s11, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 23 -; VI-NEXT: s_lshr_b32 s46, s11, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 24 -; VI-NEXT: s_lshr_b32 s46, s11, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 25 -; VI-NEXT: s_lshr_b32 s46, s10, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 26 -; VI-NEXT: s_lshr_b32 s46, s10, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 27 -; VI-NEXT: s_lshr_b32 s46, s13, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 28 -; VI-NEXT: s_lshr_b32 s46, s13, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 29 -; VI-NEXT: s_lshr_b32 s46, s13, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 30 -; VI-NEXT: s_lshr_b32 s46, s12, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 31 -; VI-NEXT: s_lshr_b32 s46, s12, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 32 -; VI-NEXT: s_lshr_b32 s46, s15, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 33 -; VI-NEXT: s_lshr_b32 s46, s15, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 34 -; VI-NEXT: s_lshr_b32 s46, s15, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 35 -; VI-NEXT: s_lshr_b32 s46, s14, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 36 -; VI-NEXT: s_lshr_b32 s46, s14, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 37 -; VI-NEXT: s_lshr_b32 s46, s41, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 38 -; VI-NEXT: s_lshr_b32 s46, s41, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 39 -; VI-NEXT: s_lshr_b32 s46, s41, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 40 -; VI-NEXT: s_lshr_b32 s46, s40, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 41 -; VI-NEXT: s_lshr_b32 s46, s40, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 42 -; VI-NEXT: s_lshr_b32 s46, s43, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 43 -; VI-NEXT: s_lshr_b32 s46, s43, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 44 -; VI-NEXT: s_lshr_b32 s46, s43, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 45 -; VI-NEXT: s_lshr_b32 s46, s42, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 46 -; VI-NEXT: s_lshr_b32 s46, s42, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 47 -; VI-NEXT: s_lshr_b32 s46, s45, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 48 -; VI-NEXT: s_lshr_b32 s46, s45, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 49 -; VI-NEXT: s_lshr_b32 s46, s45, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 50 -; VI-NEXT: s_lshr_b32 s46, s44, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 51 -; VI-NEXT: s_lshr_b32 s46, s44, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 52 -; VI-NEXT: s_lshr_b32 s46, s29, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 53 -; VI-NEXT: s_lshr_b32 s46, s29, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 54 -; VI-NEXT: s_lshr_b32 s46, s29, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 55 -; VI-NEXT: s_lshr_b32 s46, s28, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 56 -; VI-NEXT: s_lshr_b32 s46, s28, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 57 -; VI-NEXT: s_lshr_b32 s46, s27, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 58 -; VI-NEXT: s_lshr_b32 s46, s27, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 59 +; VI-NEXT: s_lshr_b32 s26, s5, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 8 +; VI-NEXT: s_lshr_b32 s26, s5, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 9 +; VI-NEXT: s_lshr_b32 s26, s5, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 10 +; VI-NEXT: s_lshr_b32 s26, s4, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 11 +; VI-NEXT: s_lshr_b32 s26, s4, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 12 +; VI-NEXT: s_lshr_b32 s26, s7, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 13 +; VI-NEXT: s_lshr_b32 s26, s7, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 14 +; VI-NEXT: s_lshr_b32 s26, s7, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 15 +; VI-NEXT: s_lshr_b32 s26, s6, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 16 +; VI-NEXT: s_lshr_b32 s26, s6, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 17 +; VI-NEXT: s_lshr_b32 s26, s9, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 18 +; VI-NEXT: s_lshr_b32 s26, s9, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 19 +; VI-NEXT: s_lshr_b32 s26, s9, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 20 +; VI-NEXT: s_lshr_b32 s26, s8, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 21 +; VI-NEXT: s_lshr_b32 s26, s8, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 22 +; VI-NEXT: s_lshr_b32 s26, s11, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 23 +; VI-NEXT: s_lshr_b32 s26, s11, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 24 +; VI-NEXT: s_lshr_b32 s26, s11, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 25 +; VI-NEXT: s_lshr_b32 s26, s10, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 26 +; VI-NEXT: s_lshr_b32 s26, s10, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 27 +; VI-NEXT: s_lshr_b32 s26, s13, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 28 +; VI-NEXT: s_lshr_b32 s26, s13, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 29 +; VI-NEXT: s_lshr_b32 s26, s13, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 30 +; VI-NEXT: s_lshr_b32 s26, s12, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 31 +; VI-NEXT: s_lshr_b32 s26, s12, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 32 +; VI-NEXT: s_lshr_b32 s26, s15, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 33 +; VI-NEXT: s_lshr_b32 s26, s15, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 34 +; VI-NEXT: s_lshr_b32 s26, s15, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 35 +; VI-NEXT: s_lshr_b32 s26, s14, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 36 +; VI-NEXT: s_lshr_b32 s26, s14, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 37 +; VI-NEXT: s_lshr_b32 s26, s17, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 38 +; VI-NEXT: s_lshr_b32 s26, s17, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 39 +; VI-NEXT: s_lshr_b32 s26, s17, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 40 +; VI-NEXT: s_lshr_b32 s26, s16, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 41 +; VI-NEXT: s_lshr_b32 s26, s16, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 42 +; VI-NEXT: s_lshr_b32 s26, s19, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 43 +; VI-NEXT: s_lshr_b32 s26, s19, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 44 +; VI-NEXT: s_lshr_b32 s26, s19, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 45 +; VI-NEXT: s_lshr_b32 s26, s18, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 46 +; VI-NEXT: s_lshr_b32 s26, s18, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 47 +; VI-NEXT: s_lshr_b32 s26, s21, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 48 +; VI-NEXT: s_lshr_b32 s26, s21, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 49 +; VI-NEXT: s_lshr_b32 s26, s21, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 50 +; VI-NEXT: s_lshr_b32 s26, s20, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 51 +; VI-NEXT: s_lshr_b32 s26, s20, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 52 +; VI-NEXT: s_lshr_b32 s26, s23, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 53 +; VI-NEXT: s_lshr_b32 s26, s23, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 54 +; VI-NEXT: s_lshr_b32 s26, s23, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 55 +; VI-NEXT: s_lshr_b32 s26, s22, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 56 +; VI-NEXT: s_lshr_b32 s26, s22, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 57 +; VI-NEXT: s_lshr_b32 s26, s25, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 58 +; VI-NEXT: s_lshr_b32 s26, s25, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 59 ; VI-NEXT: s_lshr_b64 s[60:61], s[4:5], 24 -; VI-NEXT: v_writelane_b32 v21, s60, 6 -; VI-NEXT: v_writelane_b32 v21, s61, 7 +; VI-NEXT: v_writelane_b32 v22, s60, 6 +; VI-NEXT: v_writelane_b32 v22, s61, 7 ; VI-NEXT: s_lshr_b64 s[60:61], s[6:7], 24 -; VI-NEXT: v_writelane_b32 v21, s60, 4 -; VI-NEXT: v_writelane_b32 v21, s61, 5 +; VI-NEXT: v_writelane_b32 v22, s60, 4 +; VI-NEXT: v_writelane_b32 v22, s61, 5 ; VI-NEXT: s_lshr_b64 s[60:61], s[8:9], 24 -; VI-NEXT: v_writelane_b32 v21, s60, 2 -; VI-NEXT: v_writelane_b32 v21, s61, 3 +; VI-NEXT: v_writelane_b32 v22, s60, 2 +; VI-NEXT: v_writelane_b32 v22, s61, 3 ; VI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 -; VI-NEXT: v_writelane_b32 v21, s60, 0 -; VI-NEXT: s_lshr_b32 s66, s27, 8 -; VI-NEXT: s_lshr_b32 s67, s26, 16 -; VI-NEXT: s_lshr_b32 s68, s26, 8 -; VI-NEXT: s_lshr_b32 s69, s25, 24 -; VI-NEXT: s_lshr_b32 s70, s25, 16 -; VI-NEXT: s_lshr_b32 s71, s25, 8 -; VI-NEXT: s_lshr_b32 s80, s24, 16 -; VI-NEXT: s_lshr_b32 s81, s24, 8 -; VI-NEXT: s_lshr_b32 s82, s23, 24 -; VI-NEXT: s_lshr_b32 s83, s23, 16 -; VI-NEXT: s_lshr_b32 s84, s23, 8 -; VI-NEXT: s_lshr_b32 s85, s22, 16 -; VI-NEXT: s_lshr_b32 s86, s22, 8 -; VI-NEXT: s_lshr_b32 s87, s21, 24 -; VI-NEXT: s_lshr_b32 s50, s21, 16 -; VI-NEXT: s_lshr_b32 s46, s21, 8 -; VI-NEXT: s_lshr_b32 s47, s20, 16 -; VI-NEXT: s_lshr_b32 s56, s20, 8 -; VI-NEXT: s_lshr_b32 s57, s19, 24 -; VI-NEXT: s_lshr_b32 s51, s19, 16 -; VI-NEXT: s_lshr_b32 s52, s19, 8 -; VI-NEXT: s_lshr_b32 s53, s18, 16 -; VI-NEXT: s_lshr_b32 s54, s18, 8 -; VI-NEXT: s_lshr_b32 s58, s17, 24 -; VI-NEXT: s_lshr_b32 s59, s17, 16 -; VI-NEXT: s_lshr_b32 s55, s17, 8 -; VI-NEXT: s_lshr_b32 s64, s16, 16 -; VI-NEXT: s_lshr_b32 s65, s16, 8 -; VI-NEXT: v_writelane_b32 v21, s61, 1 +; VI-NEXT: v_writelane_b32 v22, s60, 0 +; VI-NEXT: s_lshr_b32 s66, s25, 8 +; VI-NEXT: s_lshr_b32 s67, s24, 16 +; VI-NEXT: s_lshr_b32 s68, s24, 8 +; VI-NEXT: s_lshr_b32 s69, s41, 24 +; VI-NEXT: s_lshr_b32 s70, s41, 16 +; VI-NEXT: s_lshr_b32 s71, s41, 8 +; VI-NEXT: s_lshr_b32 s80, s40, 16 +; VI-NEXT: s_lshr_b32 s81, s40, 8 +; VI-NEXT: s_lshr_b32 s82, s43, 24 +; VI-NEXT: s_lshr_b32 s83, s43, 16 +; VI-NEXT: s_lshr_b32 s84, s43, 8 +; VI-NEXT: s_lshr_b32 s85, s42, 16 +; VI-NEXT: s_lshr_b32 s86, s42, 8 +; VI-NEXT: s_lshr_b32 s87, s45, 24 +; VI-NEXT: s_lshr_b32 s50, s45, 16 +; VI-NEXT: s_lshr_b32 s26, s45, 8 +; VI-NEXT: s_lshr_b32 s27, s44, 16 +; VI-NEXT: s_lshr_b32 s28, s44, 8 +; VI-NEXT: s_lshr_b32 s29, s47, 24 +; VI-NEXT: s_lshr_b32 s51, s47, 16 +; VI-NEXT: s_lshr_b32 s52, s47, 8 +; VI-NEXT: s_lshr_b32 s53, s46, 16 +; VI-NEXT: s_lshr_b32 s54, s46, 8 +; VI-NEXT: s_lshr_b32 s58, s57, 24 +; VI-NEXT: s_lshr_b32 s59, s57, 16 +; VI-NEXT: s_lshr_b32 s55, s57, 8 +; VI-NEXT: s_lshr_b32 s64, s56, 16 +; VI-NEXT: s_lshr_b32 s65, s56, 8 +; VI-NEXT: v_writelane_b32 v22, s61, 1 ; VI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 ; VI-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 -; VI-NEXT: s_lshr_b64 s[74:75], s[40:41], 24 -; VI-NEXT: s_lshr_b64 s[76:77], s[42:43], 24 -; VI-NEXT: s_lshr_b64 s[78:79], s[44:45], 24 -; VI-NEXT: s_lshr_b64 s[88:89], s[28:29], 24 -; VI-NEXT: s_lshr_b64 s[90:91], s[26:27], 24 -; VI-NEXT: s_lshr_b64 s[30:31], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[34:35], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[36:37], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[38:39], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[48:49], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[74:75], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[76:77], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[88:89], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[90:91], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[30:31], s[40:41], 24 +; VI-NEXT: s_lshr_b64 s[34:35], s[42:43], 24 +; VI-NEXT: s_lshr_b64 s[36:37], s[44:45], 24 +; VI-NEXT: s_lshr_b64 s[38:39], s[46:47], 24 +; VI-NEXT: s_lshr_b64 s[48:49], s[56:57], 24 ; VI-NEXT: s_cbranch_execnz .LBB57_3 ; VI-NEXT: .LBB57_2: ; %cmp.true -; VI-NEXT: s_add_u32 s16, s16, 3 -; VI-NEXT: s_addc_u32 s17, s17, 0 -; VI-NEXT: s_add_u32 s18, s18, 3 -; VI-NEXT: s_addc_u32 s19, s19, 0 -; VI-NEXT: s_add_u32 s20, s20, 3 -; VI-NEXT: s_addc_u32 s21, s21, 0 -; VI-NEXT: s_add_u32 s22, s22, 3 -; VI-NEXT: s_addc_u32 s23, s23, 0 -; VI-NEXT: s_add_u32 s24, s24, 3 -; VI-NEXT: s_addc_u32 s25, s25, 0 -; VI-NEXT: s_add_u32 s26, s26, 3 -; VI-NEXT: s_addc_u32 s27, s27, 0 -; VI-NEXT: s_add_u32 s28, s28, 3 -; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: s_add_u32 s56, s56, 3 +; VI-NEXT: s_addc_u32 s57, s57, 0 +; VI-NEXT: s_add_u32 s46, s46, 3 +; VI-NEXT: s_addc_u32 s47, s47, 0 ; VI-NEXT: s_add_u32 s44, s44, 3 ; VI-NEXT: s_addc_u32 s45, s45, 0 ; VI-NEXT: s_add_u32 s42, s42, 3 ; VI-NEXT: s_addc_u32 s43, s43, 0 ; VI-NEXT: s_add_u32 s40, s40, 3 ; VI-NEXT: s_addc_u32 s41, s41, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 ; VI-NEXT: s_add_u32 s14, s14, 3 ; VI-NEXT: s_addc_u32 s15, s15, 0 ; VI-NEXT: s_add_u32 s12, s12, 3 @@ -83193,413 +83413,413 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; VI-NEXT: s_addc_u32 s7, s7, 0 ; VI-NEXT: s_add_u32 s4, s4, 3 ; VI-NEXT: s_addc_u32 s5, s5, 0 -; VI-NEXT: s_lshr_b32 s46, s5, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 8 -; VI-NEXT: s_lshr_b32 s46, s5, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 9 -; VI-NEXT: s_lshr_b32 s46, s5, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 10 -; VI-NEXT: s_lshr_b32 s46, s4, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 11 -; VI-NEXT: s_lshr_b32 s46, s4, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 12 -; VI-NEXT: s_lshr_b32 s46, s7, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 13 -; VI-NEXT: s_lshr_b32 s46, s7, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 14 -; VI-NEXT: s_lshr_b32 s46, s7, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 15 -; VI-NEXT: s_lshr_b32 s46, s6, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 16 -; VI-NEXT: s_lshr_b32 s46, s6, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 17 -; VI-NEXT: s_lshr_b32 s46, s9, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 18 -; VI-NEXT: s_lshr_b32 s46, s9, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 19 -; VI-NEXT: s_lshr_b32 s46, s9, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 20 -; VI-NEXT: s_lshr_b32 s46, s8, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 21 -; VI-NEXT: s_lshr_b32 s46, s8, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 22 -; VI-NEXT: s_lshr_b32 s46, s11, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 23 -; VI-NEXT: s_lshr_b32 s46, s11, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 24 -; VI-NEXT: s_lshr_b32 s46, s11, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 25 -; VI-NEXT: s_lshr_b32 s46, s10, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 26 -; VI-NEXT: s_lshr_b32 s46, s10, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 27 -; VI-NEXT: s_lshr_b32 s46, s13, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 28 -; VI-NEXT: s_lshr_b32 s46, s13, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 29 -; VI-NEXT: s_lshr_b32 s46, s13, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 30 -; VI-NEXT: s_lshr_b32 s46, s12, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 31 -; VI-NEXT: s_lshr_b32 s46, s12, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 32 -; VI-NEXT: s_lshr_b32 s46, s15, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 33 -; VI-NEXT: s_lshr_b32 s46, s15, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 34 -; VI-NEXT: s_lshr_b32 s46, s15, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 35 -; VI-NEXT: s_lshr_b32 s46, s14, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 36 -; VI-NEXT: s_lshr_b32 s46, s14, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 37 -; VI-NEXT: s_lshr_b32 s46, s41, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 38 -; VI-NEXT: s_lshr_b32 s46, s41, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 39 -; VI-NEXT: s_lshr_b32 s46, s41, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 40 -; VI-NEXT: s_lshr_b32 s46, s40, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 41 -; VI-NEXT: s_lshr_b32 s46, s40, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 42 -; VI-NEXT: s_lshr_b32 s46, s43, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 43 -; VI-NEXT: s_lshr_b32 s46, s43, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 44 -; VI-NEXT: s_lshr_b32 s46, s43, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 45 -; VI-NEXT: s_lshr_b32 s46, s42, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 46 -; VI-NEXT: s_lshr_b32 s46, s42, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 47 -; VI-NEXT: s_lshr_b32 s46, s45, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 48 -; VI-NEXT: s_lshr_b32 s46, s45, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 49 -; VI-NEXT: s_lshr_b32 s46, s45, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 50 -; VI-NEXT: s_lshr_b32 s46, s44, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 51 -; VI-NEXT: s_lshr_b32 s46, s44, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 52 -; VI-NEXT: s_lshr_b32 s46, s29, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 53 -; VI-NEXT: s_lshr_b32 s46, s29, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 54 -; VI-NEXT: s_lshr_b32 s46, s29, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 55 -; VI-NEXT: s_lshr_b32 s46, s28, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 56 -; VI-NEXT: s_lshr_b32 s46, s28, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 57 -; VI-NEXT: s_lshr_b32 s46, s27, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 58 -; VI-NEXT: s_lshr_b32 s46, s27, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 59 +; VI-NEXT: s_lshr_b32 s26, s5, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 8 +; VI-NEXT: s_lshr_b32 s26, s5, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 9 +; VI-NEXT: s_lshr_b32 s26, s5, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 10 +; VI-NEXT: s_lshr_b32 s26, s4, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 11 +; VI-NEXT: s_lshr_b32 s26, s4, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 12 +; VI-NEXT: s_lshr_b32 s26, s7, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 13 +; VI-NEXT: s_lshr_b32 s26, s7, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 14 +; VI-NEXT: s_lshr_b32 s26, s7, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 15 +; VI-NEXT: s_lshr_b32 s26, s6, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 16 +; VI-NEXT: s_lshr_b32 s26, s6, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 17 +; VI-NEXT: s_lshr_b32 s26, s9, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 18 +; VI-NEXT: s_lshr_b32 s26, s9, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 19 +; VI-NEXT: s_lshr_b32 s26, s9, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 20 +; VI-NEXT: s_lshr_b32 s26, s8, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 21 +; VI-NEXT: s_lshr_b32 s26, s8, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 22 +; VI-NEXT: s_lshr_b32 s26, s11, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 23 +; VI-NEXT: s_lshr_b32 s26, s11, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 24 +; VI-NEXT: s_lshr_b32 s26, s11, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 25 +; VI-NEXT: s_lshr_b32 s26, s10, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 26 +; VI-NEXT: s_lshr_b32 s26, s10, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 27 +; VI-NEXT: s_lshr_b32 s26, s13, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 28 +; VI-NEXT: s_lshr_b32 s26, s13, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 29 +; VI-NEXT: s_lshr_b32 s26, s13, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 30 +; VI-NEXT: s_lshr_b32 s26, s12, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 31 +; VI-NEXT: s_lshr_b32 s26, s12, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 32 +; VI-NEXT: s_lshr_b32 s26, s15, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 33 +; VI-NEXT: s_lshr_b32 s26, s15, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 34 +; VI-NEXT: s_lshr_b32 s26, s15, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 35 +; VI-NEXT: s_lshr_b32 s26, s14, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 36 +; VI-NEXT: s_lshr_b32 s26, s14, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 37 +; VI-NEXT: s_lshr_b32 s26, s17, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 38 +; VI-NEXT: s_lshr_b32 s26, s17, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 39 +; VI-NEXT: s_lshr_b32 s26, s17, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 40 +; VI-NEXT: s_lshr_b32 s26, s16, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 41 +; VI-NEXT: s_lshr_b32 s26, s16, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 42 +; VI-NEXT: s_lshr_b32 s26, s19, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 43 +; VI-NEXT: s_lshr_b32 s26, s19, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 44 +; VI-NEXT: s_lshr_b32 s26, s19, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 45 +; VI-NEXT: s_lshr_b32 s26, s18, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 46 +; VI-NEXT: s_lshr_b32 s26, s18, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 47 +; VI-NEXT: s_lshr_b32 s26, s21, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 48 +; VI-NEXT: s_lshr_b32 s26, s21, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 49 +; VI-NEXT: s_lshr_b32 s26, s21, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 50 +; VI-NEXT: s_lshr_b32 s26, s20, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 51 +; VI-NEXT: s_lshr_b32 s26, s20, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 52 +; VI-NEXT: s_lshr_b32 s26, s23, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 53 +; VI-NEXT: s_lshr_b32 s26, s23, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 54 +; VI-NEXT: s_lshr_b32 s26, s23, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 55 +; VI-NEXT: s_lshr_b32 s26, s22, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 56 +; VI-NEXT: s_lshr_b32 s26, s22, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 57 +; VI-NEXT: s_lshr_b32 s26, s25, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 58 +; VI-NEXT: s_lshr_b32 s26, s25, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 59 ; VI-NEXT: s_lshr_b64 s[60:61], s[4:5], 24 -; VI-NEXT: v_writelane_b32 v21, s60, 6 -; VI-NEXT: v_writelane_b32 v21, s61, 7 +; VI-NEXT: v_writelane_b32 v22, s60, 6 +; VI-NEXT: v_writelane_b32 v22, s61, 7 ; VI-NEXT: s_lshr_b64 s[60:61], s[6:7], 24 -; VI-NEXT: v_writelane_b32 v21, s60, 4 -; VI-NEXT: v_writelane_b32 v21, s61, 5 +; VI-NEXT: v_writelane_b32 v22, s60, 4 +; VI-NEXT: v_writelane_b32 v22, s61, 5 ; VI-NEXT: s_lshr_b64 s[60:61], s[8:9], 24 -; VI-NEXT: v_writelane_b32 v21, s60, 2 -; VI-NEXT: v_writelane_b32 v21, s61, 3 +; VI-NEXT: v_writelane_b32 v22, s60, 2 +; VI-NEXT: v_writelane_b32 v22, s61, 3 ; VI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 -; VI-NEXT: s_lshr_b32 s66, s27, 8 -; VI-NEXT: s_lshr_b32 s67, s26, 16 -; VI-NEXT: s_lshr_b32 s68, s26, 8 -; VI-NEXT: s_lshr_b32 s69, s25, 24 -; VI-NEXT: s_lshr_b32 s70, s25, 16 -; VI-NEXT: s_lshr_b32 s71, s25, 8 -; VI-NEXT: s_lshr_b32 s80, s24, 16 -; VI-NEXT: s_lshr_b32 s81, s24, 8 -; VI-NEXT: s_lshr_b32 s82, s23, 24 -; VI-NEXT: s_lshr_b32 s83, s23, 16 -; VI-NEXT: s_lshr_b32 s84, s23, 8 -; VI-NEXT: s_lshr_b32 s85, s22, 16 -; VI-NEXT: s_lshr_b32 s86, s22, 8 -; VI-NEXT: s_lshr_b32 s87, s21, 24 -; VI-NEXT: s_lshr_b32 s50, s21, 16 -; VI-NEXT: s_lshr_b32 s46, s21, 8 -; VI-NEXT: s_lshr_b32 s47, s20, 16 -; VI-NEXT: s_lshr_b32 s56, s20, 8 -; VI-NEXT: s_lshr_b32 s57, s19, 24 -; VI-NEXT: s_lshr_b32 s51, s19, 16 -; VI-NEXT: s_lshr_b32 s52, s19, 8 -; VI-NEXT: s_lshr_b32 s53, s18, 16 -; VI-NEXT: s_lshr_b32 s54, s18, 8 -; VI-NEXT: s_lshr_b32 s58, s17, 24 -; VI-NEXT: s_lshr_b32 s59, s17, 16 -; VI-NEXT: s_lshr_b32 s55, s17, 8 -; VI-NEXT: s_lshr_b32 s64, s16, 16 -; VI-NEXT: s_lshr_b32 s65, s16, 8 -; VI-NEXT: v_writelane_b32 v21, s60, 0 +; VI-NEXT: s_lshr_b32 s66, s25, 8 +; VI-NEXT: s_lshr_b32 s67, s24, 16 +; VI-NEXT: s_lshr_b32 s68, s24, 8 +; VI-NEXT: s_lshr_b32 s69, s41, 24 +; VI-NEXT: s_lshr_b32 s70, s41, 16 +; VI-NEXT: s_lshr_b32 s71, s41, 8 +; VI-NEXT: s_lshr_b32 s80, s40, 16 +; VI-NEXT: s_lshr_b32 s81, s40, 8 +; VI-NEXT: s_lshr_b32 s82, s43, 24 +; VI-NEXT: s_lshr_b32 s83, s43, 16 +; VI-NEXT: s_lshr_b32 s84, s43, 8 +; VI-NEXT: s_lshr_b32 s85, s42, 16 +; VI-NEXT: s_lshr_b32 s86, s42, 8 +; VI-NEXT: s_lshr_b32 s87, s45, 24 +; VI-NEXT: s_lshr_b32 s50, s45, 16 +; VI-NEXT: s_lshr_b32 s26, s45, 8 +; VI-NEXT: s_lshr_b32 s27, s44, 16 +; VI-NEXT: s_lshr_b32 s28, s44, 8 +; VI-NEXT: s_lshr_b32 s29, s47, 24 +; VI-NEXT: s_lshr_b32 s51, s47, 16 +; VI-NEXT: s_lshr_b32 s52, s47, 8 +; VI-NEXT: s_lshr_b32 s53, s46, 16 +; VI-NEXT: s_lshr_b32 s54, s46, 8 +; VI-NEXT: s_lshr_b32 s58, s57, 24 +; VI-NEXT: s_lshr_b32 s59, s57, 16 +; VI-NEXT: s_lshr_b32 s55, s57, 8 +; VI-NEXT: s_lshr_b32 s64, s56, 16 +; VI-NEXT: s_lshr_b32 s65, s56, 8 +; VI-NEXT: v_writelane_b32 v22, s60, 0 ; VI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 ; VI-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 -; VI-NEXT: s_lshr_b64 s[74:75], s[40:41], 24 -; VI-NEXT: s_lshr_b64 s[76:77], s[42:43], 24 -; VI-NEXT: s_lshr_b64 s[78:79], s[44:45], 24 -; VI-NEXT: s_lshr_b64 s[88:89], s[28:29], 24 -; VI-NEXT: s_lshr_b64 s[90:91], s[26:27], 24 -; VI-NEXT: s_lshr_b64 s[30:31], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[34:35], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[36:37], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[38:39], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[48:49], s[16:17], 24 -; VI-NEXT: v_writelane_b32 v21, s61, 1 +; VI-NEXT: s_lshr_b64 s[74:75], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[76:77], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[88:89], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[90:91], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[30:31], s[40:41], 24 +; VI-NEXT: s_lshr_b64 s[34:35], s[42:43], 24 +; VI-NEXT: s_lshr_b64 s[36:37], s[44:45], 24 +; VI-NEXT: s_lshr_b64 s[38:39], s[46:47], 24 +; VI-NEXT: s_lshr_b64 s[48:49], s[56:57], 24 +; VI-NEXT: v_writelane_b32 v22, s61, 1 ; VI-NEXT: .LBB57_3: ; %end ; VI-NEXT: s_lshl_b32 s61, s65, 8 -; VI-NEXT: s_and_b32 s16, s16, 0xff -; VI-NEXT: s_or_b32 s16, s16, s61 +; VI-NEXT: s_and_b32 s56, s56, 0xff +; VI-NEXT: s_or_b32 s56, s56, s61 ; VI-NEXT: s_lshl_b32 s61, s48, 8 ; VI-NEXT: s_and_b32 s63, s64, 0xff ; VI-NEXT: s_or_b32 s61, s63, s61 -; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_and_b32 s56, s56, 0xffff ; VI-NEXT: s_lshl_b32 s61, s61, 16 -; VI-NEXT: s_or_b32 s16, s16, s61 -; VI-NEXT: v_mov_b32_e32 v1, s16 -; VI-NEXT: s_and_b32 s16, s17, 0xff -; VI-NEXT: s_lshl_b32 s17, s55, 8 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_and_b32 s17, s59, 0xff +; VI-NEXT: s_or_b32 s56, s56, s61 +; VI-NEXT: v_mov_b32_e32 v1, s56 +; VI-NEXT: s_and_b32 s56, s57, 0xff +; VI-NEXT: s_lshl_b32 s57, s55, 8 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, s59, 0xff ; VI-NEXT: s_lshl_b32 s58, s58, 8 -; VI-NEXT: s_or_b32 s17, s17, s58 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: s_lshl_b32 s16, s54, 8 -; VI-NEXT: s_and_b32 s17, s18, 0xff -; VI-NEXT: s_or_b32 s16, s17, s16 -; VI-NEXT: s_lshl_b32 s17, s38, 8 -; VI-NEXT: s_and_b32 s18, s53, 0xff -; VI-NEXT: s_or_b32 s17, s18, s17 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_mov_b32_e32 v3, s16 -; VI-NEXT: s_and_b32 s16, s19, 0xff -; VI-NEXT: s_lshl_b32 s17, s52, 8 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_and_b32 s17, s51, 0xff -; VI-NEXT: s_lshl_b32 s18, s57, 8 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_mov_b32_e32 v4, s16 -; VI-NEXT: s_lshl_b32 s16, s56, 8 -; VI-NEXT: s_and_b32 s17, s20, 0xff -; VI-NEXT: s_or_b32 s16, s17, s16 -; VI-NEXT: s_lshl_b32 s17, s36, 8 -; VI-NEXT: s_and_b32 s18, s47, 0xff -; VI-NEXT: s_or_b32 s17, s18, s17 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_mov_b32_e32 v5, s16 -; VI-NEXT: s_and_b32 s16, s21, 0xff -; VI-NEXT: s_lshl_b32 s17, s46, 8 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_and_b32 s17, s50, 0xff -; VI-NEXT: s_lshl_b32 s18, s87, 8 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_mov_b32_e32 v6, s16 -; VI-NEXT: s_lshl_b32 s16, s86, 8 -; VI-NEXT: s_and_b32 s17, s22, 0xff -; VI-NEXT: s_or_b32 s16, s17, s16 -; VI-NEXT: s_lshl_b32 s17, s34, 8 -; VI-NEXT: s_and_b32 s18, s85, 0xff -; VI-NEXT: s_or_b32 s17, s18, s17 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_mov_b32_e32 v7, s16 -; VI-NEXT: s_and_b32 s16, s23, 0xff -; VI-NEXT: s_lshl_b32 s17, s84, 8 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_and_b32 s17, s83, 0xff -; VI-NEXT: s_lshl_b32 s18, s82, 8 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_mov_b32_e32 v8, s16 -; VI-NEXT: s_lshl_b32 s16, s81, 8 -; VI-NEXT: s_and_b32 s17, s24, 0xff -; VI-NEXT: s_or_b32 s16, s17, s16 -; VI-NEXT: s_lshl_b32 s17, s30, 8 -; VI-NEXT: s_and_b32 s18, s80, 0xff -; VI-NEXT: s_or_b32 s17, s18, s17 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_mov_b32_e32 v9, s16 -; VI-NEXT: s_and_b32 s16, s25, 0xff -; VI-NEXT: s_lshl_b32 s17, s71, 8 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_and_b32 s17, s70, 0xff -; VI-NEXT: s_lshl_b32 s18, s69, 8 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_mov_b32_e32 v10, s16 -; VI-NEXT: s_lshl_b32 s16, s68, 8 -; VI-NEXT: s_and_b32 s17, s26, 0xff -; VI-NEXT: s_or_b32 s16, s17, s16 -; VI-NEXT: s_lshl_b32 s17, s90, 8 -; VI-NEXT: s_and_b32 s18, s67, 0xff -; VI-NEXT: s_or_b32 s17, s18, s17 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_mov_b32_e32 v11, s16 -; VI-NEXT: s_and_b32 s16, s27, 0xff -; VI-NEXT: s_lshl_b32 s17, s66, 8 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_readlane_b32 s17, v21, 59 -; VI-NEXT: v_readlane_b32 s18, v21, 58 -; VI-NEXT: s_and_b32 s17, s17, 0xff -; VI-NEXT: s_lshl_b32 s18, s18, 8 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_mov_b32_e32 v12, s16 -; VI-NEXT: v_readlane_b32 s16, v21, 57 -; VI-NEXT: s_lshl_b32 s16, s16, 8 -; VI-NEXT: s_and_b32 s17, s28, 0xff -; VI-NEXT: v_readlane_b32 s18, v21, 56 -; VI-NEXT: s_or_b32 s16, s17, s16 -; VI-NEXT: s_lshl_b32 s17, s88, 8 -; VI-NEXT: s_and_b32 s18, s18, 0xff -; VI-NEXT: s_or_b32 s17, s18, s17 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_readlane_b32 s17, v21, 55 -; VI-NEXT: v_mov_b32_e32 v13, s16 -; VI-NEXT: s_and_b32 s16, s29, 0xff -; VI-NEXT: s_lshl_b32 s17, s17, 8 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_readlane_b32 s17, v21, 54 -; VI-NEXT: v_readlane_b32 s18, v21, 53 -; VI-NEXT: s_and_b32 s17, s17, 0xff -; VI-NEXT: s_lshl_b32 s18, s18, 8 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_mov_b32_e32 v14, s16 -; VI-NEXT: v_readlane_b32 s16, v21, 52 -; VI-NEXT: s_lshl_b32 s16, s16, 8 -; VI-NEXT: s_and_b32 s17, s44, 0xff -; VI-NEXT: v_readlane_b32 s18, v21, 51 -; VI-NEXT: s_or_b32 s16, s17, s16 -; VI-NEXT: s_lshl_b32 s17, s78, 8 -; VI-NEXT: s_and_b32 s18, s18, 0xff -; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s56, s56, 0xffff +; VI-NEXT: s_lshl_b32 s57, s57, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: v_mov_b32_e32 v2, s56 +; VI-NEXT: s_lshl_b32 s56, s54, 8 +; VI-NEXT: s_and_b32 s46, s46, 0xff +; VI-NEXT: s_or_b32 s46, s46, s56 +; VI-NEXT: s_lshl_b32 s56, s38, 8 +; VI-NEXT: s_and_b32 s57, s53, 0xff +; VI-NEXT: s_or_b32 s56, s57, s56 +; VI-NEXT: s_and_b32 s46, s46, 0xffff +; VI-NEXT: s_lshl_b32 s56, s56, 16 +; VI-NEXT: s_or_b32 s46, s46, s56 +; VI-NEXT: v_mov_b32_e32 v3, s46 +; VI-NEXT: s_and_b32 s46, s47, 0xff +; VI-NEXT: s_lshl_b32 s47, s52, 8 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, s51, 0xff +; VI-NEXT: s_lshl_b32 s29, s29, 8 +; VI-NEXT: s_or_b32 s29, s47, s29 +; VI-NEXT: s_and_b32 s46, s46, 0xffff +; VI-NEXT: s_lshl_b32 s29, s29, 16 +; VI-NEXT: s_or_b32 s29, s46, s29 +; VI-NEXT: v_mov_b32_e32 v4, s29 +; VI-NEXT: s_lshl_b32 s28, s28, 8 +; VI-NEXT: s_and_b32 s29, s44, 0xff +; VI-NEXT: s_or_b32 s28, s29, s28 +; VI-NEXT: s_lshl_b32 s29, s36, 8 +; VI-NEXT: s_and_b32 s27, s27, 0xff +; VI-NEXT: s_or_b32 s27, s27, s29 +; VI-NEXT: s_and_b32 s28, s28, 0xffff +; VI-NEXT: s_lshl_b32 s27, s27, 16 +; VI-NEXT: s_or_b32 s27, s28, s27 +; VI-NEXT: v_mov_b32_e32 v5, s27 +; VI-NEXT: s_and_b32 s27, s45, 0xff +; VI-NEXT: s_lshl_b32 s26, s26, 8 +; VI-NEXT: s_or_b32 s26, s27, s26 +; VI-NEXT: s_and_b32 s27, s50, 0xff +; VI-NEXT: s_lshl_b32 s28, s87, 8 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s26, s26, 0xffff +; VI-NEXT: s_lshl_b32 s27, s27, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: v_mov_b32_e32 v6, s26 +; VI-NEXT: s_lshl_b32 s26, s86, 8 +; VI-NEXT: s_and_b32 s27, s42, 0xff +; VI-NEXT: s_or_b32 s26, s27, s26 +; VI-NEXT: s_lshl_b32 s27, s34, 8 +; VI-NEXT: s_and_b32 s28, s85, 0xff +; VI-NEXT: s_or_b32 s27, s28, s27 +; VI-NEXT: s_and_b32 s26, s26, 0xffff +; VI-NEXT: s_lshl_b32 s27, s27, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: v_mov_b32_e32 v7, s26 +; VI-NEXT: s_and_b32 s26, s43, 0xff +; VI-NEXT: s_lshl_b32 s27, s84, 8 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, s83, 0xff +; VI-NEXT: s_lshl_b32 s28, s82, 8 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s26, s26, 0xffff +; VI-NEXT: s_lshl_b32 s27, s27, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: v_mov_b32_e32 v8, s26 +; VI-NEXT: s_lshl_b32 s26, s81, 8 +; VI-NEXT: s_and_b32 s27, s40, 0xff +; VI-NEXT: s_or_b32 s26, s27, s26 +; VI-NEXT: s_lshl_b32 s27, s30, 8 +; VI-NEXT: s_and_b32 s28, s80, 0xff +; VI-NEXT: s_or_b32 s27, s28, s27 +; VI-NEXT: s_and_b32 s26, s26, 0xffff +; VI-NEXT: s_lshl_b32 s27, s27, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: v_mov_b32_e32 v9, s26 +; VI-NEXT: s_and_b32 s26, s41, 0xff +; VI-NEXT: s_lshl_b32 s27, s71, 8 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, s70, 0xff +; VI-NEXT: s_lshl_b32 s28, s69, 8 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s26, s26, 0xffff +; VI-NEXT: s_lshl_b32 s27, s27, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: s_lshl_b32 s26, s68, 8 +; VI-NEXT: s_and_b32 s24, s24, 0xff +; VI-NEXT: s_or_b32 s24, s24, s26 +; VI-NEXT: s_lshl_b32 s26, s90, 8 +; VI-NEXT: s_and_b32 s27, s67, 0xff +; VI-NEXT: s_or_b32 s26, s27, s26 +; VI-NEXT: s_and_b32 s24, s24, 0xffff +; VI-NEXT: s_lshl_b32 s26, s26, 16 +; VI-NEXT: s_or_b32 s24, s24, s26 +; VI-NEXT: v_mov_b32_e32 v11, s24 +; VI-NEXT: s_and_b32 s24, s25, 0xff +; VI-NEXT: s_lshl_b32 s25, s66, 8 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: v_readlane_b32 s25, v22, 59 +; VI-NEXT: v_readlane_b32 s26, v22, 58 +; VI-NEXT: s_and_b32 s25, s25, 0xff +; VI-NEXT: s_lshl_b32 s26, s26, 8 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s24, s24, 0xffff +; VI-NEXT: s_lshl_b32 s25, s25, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: v_mov_b32_e32 v12, s24 +; VI-NEXT: v_readlane_b32 s24, v22, 57 +; VI-NEXT: s_lshl_b32 s24, s24, 8 +; VI-NEXT: s_and_b32 s22, s22, 0xff +; VI-NEXT: v_readlane_b32 s25, v22, 56 +; VI-NEXT: s_or_b32 s22, s22, s24 +; VI-NEXT: s_lshl_b32 s24, s88, 8 +; VI-NEXT: s_and_b32 s25, s25, 0xff +; VI-NEXT: s_or_b32 s24, s25, s24 +; VI-NEXT: s_and_b32 s22, s22, 0xffff +; VI-NEXT: s_lshl_b32 s24, s24, 16 +; VI-NEXT: s_or_b32 s22, s22, s24 +; VI-NEXT: v_mov_b32_e32 v13, s22 +; VI-NEXT: s_and_b32 s22, s23, 0xff +; VI-NEXT: v_readlane_b32 s23, v22, 55 +; VI-NEXT: s_lshl_b32 s23, s23, 8 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: v_readlane_b32 s23, v22, 54 +; VI-NEXT: v_readlane_b32 s24, v22, 53 +; VI-NEXT: s_and_b32 s23, s23, 0xff +; VI-NEXT: s_lshl_b32 s24, s24, 8 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s22, s22, 0xffff +; VI-NEXT: s_lshl_b32 s23, s23, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: v_mov_b32_e32 v14, s22 +; VI-NEXT: v_readlane_b32 s22, v22, 52 +; VI-NEXT: s_lshl_b32 s22, s22, 8 +; VI-NEXT: s_and_b32 s20, s20, 0xff +; VI-NEXT: v_readlane_b32 s23, v22, 51 +; VI-NEXT: s_or_b32 s20, s20, s22 +; VI-NEXT: s_lshl_b32 s22, s78, 8 +; VI-NEXT: s_and_b32 s23, s23, 0xff +; VI-NEXT: s_or_b32 s22, s23, s22 +; VI-NEXT: s_and_b32 s20, s20, 0xffff +; VI-NEXT: s_lshl_b32 s22, s22, 16 +; VI-NEXT: s_or_b32 s20, s20, s22 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v0 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: v_mov_b32_e32 v15, s20 +; VI-NEXT: s_and_b32 s20, s21, 0xff +; VI-NEXT: v_readlane_b32 s21, v22, 50 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_readlane_b32 s17, v21, 50 +; VI-NEXT: s_lshl_b32 s21, s21, 8 ; VI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0 -; VI-NEXT: v_mov_b32_e32 v15, s16 -; VI-NEXT: s_and_b32 s16, s45, 0xff -; VI-NEXT: s_lshl_b32 s17, s17, 8 +; VI-NEXT: s_or_b32 s20, s20, s21 ; VI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0 -; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s21, v22, 49 +; VI-NEXT: v_readlane_b32 s22, v22, 48 ; VI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0 -; VI-NEXT: v_readlane_b32 s17, v21, 49 -; VI-NEXT: v_readlane_b32 s18, v21, 48 +; VI-NEXT: s_and_b32 s21, s21, 0xff +; VI-NEXT: s_lshl_b32 s22, s22, 8 ; VI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0 -; VI-NEXT: s_and_b32 s17, s17, 0xff -; VI-NEXT: s_lshl_b32 s18, s18, 8 +; VI-NEXT: s_or_b32 s21, s21, s22 ; VI-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0 -; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s20, s20, 0xffff +; VI-NEXT: s_lshl_b32 s21, s21, 16 ; VI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 ; VI-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 36, v0 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_readlane_b32 s17, v21, 47 +; VI-NEXT: v_mov_b32_e32 v2, s20 +; VI-NEXT: v_readlane_b32 s20, v22, 47 ; VI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 40, v0 -; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: s_and_b32 s16, s42, 0xff -; VI-NEXT: s_lshl_b32 s17, s17, 8 +; VI-NEXT: s_and_b32 s18, s18, 0xff +; VI-NEXT: s_lshl_b32 s20, s20, 8 ; VI-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 44, v0 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_readlane_b32 s17, v21, 46 +; VI-NEXT: s_or_b32 s18, s18, s20 +; VI-NEXT: v_readlane_b32 s20, v22, 46 ; VI-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v0 -; VI-NEXT: s_and_b32 s17, s17, 0xff -; VI-NEXT: s_lshl_b32 s18, s76, 8 +; VI-NEXT: s_and_b32 s20, s20, 0xff +; VI-NEXT: s_lshl_b32 s21, s76, 8 ; VI-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 52, v0 -; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_or_b32 s20, s20, s21 ; VI-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 56, v0 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_and_b32 s18, s18, 0xffff +; VI-NEXT: s_lshl_b32 s20, s20, 16 ; VI-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 60, v0 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_readlane_b32 s17, v21, 45 +; VI-NEXT: s_or_b32 s18, s18, s20 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: s_and_b32 s16, s43, 0xff -; VI-NEXT: s_lshl_b32 s17, s17, 8 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_readlane_b32 s17, v21, 44 -; VI-NEXT: v_readlane_b32 s18, v21, 43 -; VI-NEXT: s_and_b32 s17, s17, 0xff -; VI-NEXT: s_lshl_b32 s18, s18, 8 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_and_b32 s18, s19, 0xff +; VI-NEXT: v_readlane_b32 s19, v22, 45 +; VI-NEXT: s_lshl_b32 s19, s19, 8 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: v_readlane_b32 s19, v22, 44 +; VI-NEXT: v_readlane_b32 s20, v22, 43 +; VI-NEXT: s_and_b32 s19, s19, 0xff +; VI-NEXT: s_lshl_b32 s20, s20, 8 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s18, s18, 0xffff +; VI-NEXT: s_lshl_b32 s19, s19, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 64, v0 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_readlane_b32 s17, v21, 42 +; VI-NEXT: s_or_b32 s18, s18, s19 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: s_and_b32 s16, s40, 0xff -; VI-NEXT: s_lshl_b32 s17, s17, 8 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_readlane_b32 s17, v21, 41 -; VI-NEXT: s_and_b32 s17, s17, 0xff -; VI-NEXT: s_lshl_b32 s18, s74, 8 -; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_readlane_b32 s18, v22, 42 +; VI-NEXT: s_and_b32 s16, s16, 0xff +; VI-NEXT: s_lshl_b32 s18, s18, 8 +; VI-NEXT: s_or_b32 s16, s16, s18 +; VI-NEXT: v_readlane_b32 s18, v22, 41 +; VI-NEXT: s_and_b32 s18, s18, 0xff +; VI-NEXT: s_lshl_b32 s19, s74, 8 +; VI-NEXT: s_or_b32 s18, s18, s19 ; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_lshl_b32 s18, s18, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x44, v0 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_readlane_b32 s17, v21, 40 +; VI-NEXT: s_or_b32 s16, s16, s18 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: s_and_b32 s16, s41, 0xff +; VI-NEXT: s_and_b32 s16, s17, 0xff +; VI-NEXT: v_readlane_b32 s17, v22, 40 ; VI-NEXT: s_lshl_b32 s17, s17, 8 ; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_readlane_b32 s17, v21, 39 -; VI-NEXT: v_readlane_b32 s18, v21, 38 +; VI-NEXT: v_readlane_b32 s17, v22, 39 +; VI-NEXT: v_readlane_b32 s18, v22, 38 ; VI-NEXT: s_and_b32 s17, s17, 0xff ; VI-NEXT: s_lshl_b32 s18, s18, 8 ; VI-NEXT: s_or_b32 s17, s17, s18 @@ -83609,11 +83829,11 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; VI-NEXT: s_or_b32 s16, s16, s17 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: v_readlane_b32 s16, v21, 37 +; VI-NEXT: v_readlane_b32 s16, v22, 37 ; VI-NEXT: s_and_b32 s14, s14, 0xff ; VI-NEXT: s_lshl_b32 s16, s16, 8 ; VI-NEXT: s_or_b32 s14, s14, s16 -; VI-NEXT: v_readlane_b32 s16, v21, 36 +; VI-NEXT: v_readlane_b32 s16, v22, 36 ; VI-NEXT: s_and_b32 s16, s16, 0xff ; VI-NEXT: s_lshl_b32 s17, s72, 8 ; VI-NEXT: s_or_b32 s16, s16, s17 @@ -83624,11 +83844,11 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s14 ; VI-NEXT: s_and_b32 s14, s15, 0xff -; VI-NEXT: v_readlane_b32 s15, v21, 35 +; VI-NEXT: v_readlane_b32 s15, v22, 35 ; VI-NEXT: s_lshl_b32 s15, s15, 8 ; VI-NEXT: s_or_b32 s14, s14, s15 -; VI-NEXT: v_readlane_b32 s15, v21, 34 -; VI-NEXT: v_readlane_b32 s16, v21, 33 +; VI-NEXT: v_readlane_b32 s15, v22, 34 +; VI-NEXT: v_readlane_b32 s16, v22, 33 ; VI-NEXT: s_and_b32 s15, s15, 0xff ; VI-NEXT: s_lshl_b32 s16, s16, 8 ; VI-NEXT: s_or_b32 s15, s15, s16 @@ -83638,11 +83858,11 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; VI-NEXT: s_or_b32 s14, s14, s15 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s14 -; VI-NEXT: v_readlane_b32 s14, v21, 32 +; VI-NEXT: v_readlane_b32 s14, v22, 32 ; VI-NEXT: s_and_b32 s12, s12, 0xff ; VI-NEXT: s_lshl_b32 s14, s14, 8 ; VI-NEXT: s_or_b32 s12, s12, s14 -; VI-NEXT: v_readlane_b32 s14, v21, 31 +; VI-NEXT: v_readlane_b32 s14, v22, 31 ; VI-NEXT: s_and_b32 s14, s14, 0xff ; VI-NEXT: s_lshl_b32 s15, s62, 8 ; VI-NEXT: s_or_b32 s14, s14, s15 @@ -83653,11 +83873,11 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s12 ; VI-NEXT: s_and_b32 s12, s13, 0xff -; VI-NEXT: v_readlane_b32 s13, v21, 30 +; VI-NEXT: v_readlane_b32 s13, v22, 30 ; VI-NEXT: s_lshl_b32 s13, s13, 8 ; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: v_readlane_b32 s13, v21, 29 -; VI-NEXT: v_readlane_b32 s14, v21, 28 +; VI-NEXT: v_readlane_b32 s13, v22, 29 +; VI-NEXT: v_readlane_b32 s14, v22, 28 ; VI-NEXT: s_and_b32 s13, s13, 0xff ; VI-NEXT: s_lshl_b32 s14, s14, 8 ; VI-NEXT: s_or_b32 s13, s13, s14 @@ -83667,12 +83887,12 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; VI-NEXT: s_or_b32 s12, s12, s13 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s12 -; VI-NEXT: v_readlane_b32 s12, v21, 27 +; VI-NEXT: v_readlane_b32 s12, v22, 27 ; VI-NEXT: s_and_b32 s10, s10, 0xff ; VI-NEXT: s_lshl_b32 s12, s12, 8 ; VI-NEXT: s_or_b32 s10, s10, s12 -; VI-NEXT: v_readlane_b32 s12, v21, 26 -; VI-NEXT: v_readlane_b32 s14, v21, 0 +; VI-NEXT: v_readlane_b32 s12, v22, 26 +; VI-NEXT: v_readlane_b32 s14, v22, 0 ; VI-NEXT: s_and_b32 s12, s12, 0xff ; VI-NEXT: s_lshl_b32 s13, s14, 8 ; VI-NEXT: s_or_b32 s12, s12, s13 @@ -83683,11 +83903,11 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: s_and_b32 s10, s11, 0xff -; VI-NEXT: v_readlane_b32 s11, v21, 25 +; VI-NEXT: v_readlane_b32 s11, v22, 25 ; VI-NEXT: s_lshl_b32 s11, s11, 8 ; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: v_readlane_b32 s11, v21, 24 -; VI-NEXT: v_readlane_b32 s12, v21, 23 +; VI-NEXT: v_readlane_b32 s11, v22, 24 +; VI-NEXT: v_readlane_b32 s12, v22, 23 ; VI-NEXT: s_and_b32 s11, s11, 0xff ; VI-NEXT: s_lshl_b32 s12, s12, 8 ; VI-NEXT: s_or_b32 s11, s11, s12 @@ -83697,12 +83917,12 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; VI-NEXT: s_or_b32 s10, s10, s11 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_readlane_b32 s10, v21, 22 +; VI-NEXT: v_readlane_b32 s10, v22, 22 ; VI-NEXT: s_and_b32 s8, s8, 0xff ; VI-NEXT: s_lshl_b32 s10, s10, 8 ; VI-NEXT: s_or_b32 s8, s8, s10 -; VI-NEXT: v_readlane_b32 s10, v21, 21 -; VI-NEXT: v_readlane_b32 s12, v21, 2 +; VI-NEXT: v_readlane_b32 s10, v22, 21 +; VI-NEXT: v_readlane_b32 s12, v22, 2 ; VI-NEXT: s_and_b32 s10, s10, 0xff ; VI-NEXT: s_lshl_b32 s11, s12, 8 ; VI-NEXT: s_or_b32 s10, s10, s11 @@ -83713,11 +83933,11 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: s_and_b32 s8, s9, 0xff -; VI-NEXT: v_readlane_b32 s9, v21, 20 +; VI-NEXT: v_readlane_b32 s9, v22, 20 ; VI-NEXT: s_lshl_b32 s9, s9, 8 ; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: v_readlane_b32 s9, v21, 19 -; VI-NEXT: v_readlane_b32 s10, v21, 18 +; VI-NEXT: v_readlane_b32 s9, v22, 19 +; VI-NEXT: v_readlane_b32 s10, v22, 18 ; VI-NEXT: s_and_b32 s9, s9, 0xff ; VI-NEXT: s_lshl_b32 s10, s10, 8 ; VI-NEXT: s_or_b32 s9, s9, s10 @@ -83727,12 +83947,12 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; VI-NEXT: s_or_b32 s8, s8, s9 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_readlane_b32 s8, v21, 17 +; VI-NEXT: v_readlane_b32 s8, v22, 17 ; VI-NEXT: s_and_b32 s6, s6, 0xff ; VI-NEXT: s_lshl_b32 s8, s8, 8 ; VI-NEXT: s_or_b32 s6, s6, s8 -; VI-NEXT: v_readlane_b32 s8, v21, 16 -; VI-NEXT: v_readlane_b32 s10, v21, 4 +; VI-NEXT: v_readlane_b32 s8, v22, 16 +; VI-NEXT: v_readlane_b32 s10, v22, 4 ; VI-NEXT: s_and_b32 s8, s8, 0xff ; VI-NEXT: s_lshl_b32 s9, s10, 8 ; VI-NEXT: s_or_b32 s8, s8, s9 @@ -83743,11 +83963,11 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_and_b32 s6, s7, 0xff -; VI-NEXT: v_readlane_b32 s7, v21, 15 +; VI-NEXT: v_readlane_b32 s7, v22, 15 ; VI-NEXT: s_lshl_b32 s7, s7, 8 ; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: v_readlane_b32 s7, v21, 14 -; VI-NEXT: v_readlane_b32 s8, v21, 13 +; VI-NEXT: v_readlane_b32 s7, v22, 14 +; VI-NEXT: v_readlane_b32 s8, v22, 13 ; VI-NEXT: s_and_b32 s7, s7, 0xff ; VI-NEXT: s_lshl_b32 s8, s8, 8 ; VI-NEXT: s_or_b32 s7, s7, s8 @@ -83757,12 +83977,12 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_readlane_b32 s6, v21, 12 +; VI-NEXT: v_readlane_b32 s6, v22, 12 ; VI-NEXT: s_and_b32 s4, s4, 0xff ; VI-NEXT: s_lshl_b32 s6, s6, 8 ; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: v_readlane_b32 s6, v21, 11 -; VI-NEXT: v_readlane_b32 s8, v21, 6 +; VI-NEXT: v_readlane_b32 s6, v22, 11 +; VI-NEXT: v_readlane_b32 s8, v22, 6 ; VI-NEXT: s_and_b32 s6, s6, 0xff ; VI-NEXT: s_lshl_b32 s7, s8, 8 ; VI-NEXT: s_or_b32 s6, s6, s7 @@ -83773,11 +83993,11 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_and_b32 s4, s5, 0xff -; VI-NEXT: v_readlane_b32 s5, v21, 10 +; VI-NEXT: v_readlane_b32 s5, v22, 10 ; VI-NEXT: s_lshl_b32 s5, s5, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_readlane_b32 s5, v21, 9 -; VI-NEXT: v_readlane_b32 s6, v21, 8 +; VI-NEXT: v_readlane_b32 s5, v22, 9 +; VI-NEXT: v_readlane_b32 s6, v22, 8 ; VI-NEXT: s_and_b32 s5, s5, 0xff ; VI-NEXT: s_lshl_b32 s6, s6, 8 ; VI-NEXT: s_or_b32 s5, s5, s6 @@ -83788,46 +84008,46 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 ; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_readlane_b32 s15, v21, 1 -; VI-NEXT: v_readlane_b32 s13, v21, 3 -; VI-NEXT: v_readlane_b32 s11, v21, 5 -; VI-NEXT: v_readlane_b32 s9, v21, 7 +; VI-NEXT: v_readlane_b32 s15, v22, 1 +; VI-NEXT: v_readlane_b32 s13, v22, 3 +; VI-NEXT: v_readlane_b32 s11, v22, 5 +; VI-NEXT: v_readlane_b32 s9, v22, 7 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: v_readlane_b32 s87, v20, 31 -; VI-NEXT: v_readlane_b32 s86, v20, 30 -; VI-NEXT: v_readlane_b32 s85, v20, 29 -; VI-NEXT: v_readlane_b32 s84, v20, 28 -; VI-NEXT: v_readlane_b32 s83, v20, 27 -; VI-NEXT: v_readlane_b32 s82, v20, 26 -; VI-NEXT: v_readlane_b32 s81, v20, 25 -; VI-NEXT: v_readlane_b32 s80, v20, 24 -; VI-NEXT: v_readlane_b32 s71, v20, 23 -; VI-NEXT: v_readlane_b32 s70, v20, 22 -; VI-NEXT: v_readlane_b32 s69, v20, 21 -; VI-NEXT: v_readlane_b32 s68, v20, 20 -; VI-NEXT: v_readlane_b32 s67, v20, 19 -; VI-NEXT: v_readlane_b32 s66, v20, 18 -; VI-NEXT: v_readlane_b32 s65, v20, 17 -; VI-NEXT: v_readlane_b32 s64, v20, 16 -; VI-NEXT: v_readlane_b32 s55, v20, 15 -; VI-NEXT: v_readlane_b32 s54, v20, 14 -; VI-NEXT: v_readlane_b32 s53, v20, 13 -; VI-NEXT: v_readlane_b32 s52, v20, 12 -; VI-NEXT: v_readlane_b32 s51, v20, 11 -; VI-NEXT: v_readlane_b32 s50, v20, 10 -; VI-NEXT: v_readlane_b32 s49, v20, 9 -; VI-NEXT: v_readlane_b32 s48, v20, 8 -; VI-NEXT: v_readlane_b32 s39, v20, 7 -; VI-NEXT: v_readlane_b32 s38, v20, 6 -; VI-NEXT: v_readlane_b32 s37, v20, 5 -; VI-NEXT: v_readlane_b32 s36, v20, 4 -; VI-NEXT: v_readlane_b32 s35, v20, 3 -; VI-NEXT: v_readlane_b32 s34, v20, 2 -; VI-NEXT: v_readlane_b32 s31, v20, 1 -; VI-NEXT: v_readlane_b32 s30, v20, 0 +; VI-NEXT: v_readlane_b32 s87, v21, 31 +; VI-NEXT: v_readlane_b32 s86, v21, 30 +; VI-NEXT: v_readlane_b32 s85, v21, 29 +; VI-NEXT: v_readlane_b32 s84, v21, 28 +; VI-NEXT: v_readlane_b32 s83, v21, 27 +; VI-NEXT: v_readlane_b32 s82, v21, 26 +; VI-NEXT: v_readlane_b32 s81, v21, 25 +; VI-NEXT: v_readlane_b32 s80, v21, 24 +; VI-NEXT: v_readlane_b32 s71, v21, 23 +; VI-NEXT: v_readlane_b32 s70, v21, 22 +; VI-NEXT: v_readlane_b32 s69, v21, 21 +; VI-NEXT: v_readlane_b32 s68, v21, 20 +; VI-NEXT: v_readlane_b32 s67, v21, 19 +; VI-NEXT: v_readlane_b32 s66, v21, 18 +; VI-NEXT: v_readlane_b32 s65, v21, 17 +; VI-NEXT: v_readlane_b32 s64, v21, 16 +; VI-NEXT: v_readlane_b32 s55, v21, 15 +; VI-NEXT: v_readlane_b32 s54, v21, 14 +; VI-NEXT: v_readlane_b32 s53, v21, 13 +; VI-NEXT: v_readlane_b32 s52, v21, 12 +; VI-NEXT: v_readlane_b32 s51, v21, 11 +; VI-NEXT: v_readlane_b32 s50, v21, 10 +; VI-NEXT: v_readlane_b32 s49, v21, 9 +; VI-NEXT: v_readlane_b32 s48, v21, 8 +; VI-NEXT: v_readlane_b32 s39, v21, 7 +; VI-NEXT: v_readlane_b32 s38, v21, 6 +; VI-NEXT: v_readlane_b32 s37, v21, 5 +; VI-NEXT: v_readlane_b32 s36, v21, 4 +; VI-NEXT: v_readlane_b32 s35, v21, 3 +; VI-NEXT: v_readlane_b32 s34, v21, 2 +; VI-NEXT: v_readlane_b32 s31, v21, 1 +; VI-NEXT: v_readlane_b32 s30, v21, 0 ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -83845,10 +84065,10 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; VI-NEXT: ; implicit-def: $sgpr53 ; VI-NEXT: ; implicit-def: $sgpr52 ; VI-NEXT: ; implicit-def: $sgpr51 -; VI-NEXT: ; implicit-def: $sgpr57 -; VI-NEXT: ; implicit-def: $sgpr56 -; VI-NEXT: ; implicit-def: $sgpr47 -; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; implicit-def: $sgpr50 ; VI-NEXT: ; implicit-def: $sgpr87 ; VI-NEXT: ; implicit-def: $sgpr86 @@ -83977,68 +84197,96 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; VI-NEXT: ; implicit-def: $sgpr60 ; VI-NEXT: ; kill: killed $sgpr60 ; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: v_writelane_b32 v21, s60, 0 -; VI-NEXT: v_writelane_b32 v21, s61, 1 +; VI-NEXT: v_writelane_b32 v22, s60, 0 +; VI-NEXT: v_writelane_b32 v22, s61, 1 ; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: v_writelane_b32 v21, s60, 2 -; VI-NEXT: v_writelane_b32 v21, s61, 3 +; VI-NEXT: v_writelane_b32 v22, s60, 2 +; VI-NEXT: v_writelane_b32 v22, s61, 3 ; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: v_writelane_b32 v21, s60, 4 -; VI-NEXT: v_writelane_b32 v21, s61, 5 +; VI-NEXT: v_writelane_b32 v22, s60, 4 +; VI-NEXT: v_writelane_b32 v22, s61, 5 ; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: v_writelane_b32 v21, s60, 6 -; VI-NEXT: v_writelane_b32 v21, s61, 7 +; VI-NEXT: v_writelane_b32 v22, s60, 6 +; VI-NEXT: v_writelane_b32 v22, s61, 7 ; VI-NEXT: s_branch .LBB57_2 ; ; GFX9-LABEL: bitcast_v16i64_to_v128i8_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v20, s30, 0 -; GFX9-NEXT: v_writelane_b32 v20, s31, 1 -; GFX9-NEXT: v_writelane_b32 v20, s34, 2 -; GFX9-NEXT: v_writelane_b32 v20, s35, 3 -; GFX9-NEXT: v_writelane_b32 v20, s36, 4 -; GFX9-NEXT: v_writelane_b32 v20, s37, 5 -; GFX9-NEXT: v_writelane_b32 v20, s38, 6 -; GFX9-NEXT: v_writelane_b32 v20, s39, 7 -; GFX9-NEXT: v_writelane_b32 v20, s48, 8 -; GFX9-NEXT: v_writelane_b32 v20, s49, 9 -; GFX9-NEXT: v_writelane_b32 v20, s50, 10 -; GFX9-NEXT: v_writelane_b32 v20, s51, 11 -; GFX9-NEXT: v_writelane_b32 v20, s52, 12 -; GFX9-NEXT: v_writelane_b32 v20, s53, 13 -; GFX9-NEXT: v_writelane_b32 v20, s54, 14 -; GFX9-NEXT: v_writelane_b32 v20, s55, 15 -; GFX9-NEXT: v_writelane_b32 v20, s64, 16 -; GFX9-NEXT: v_writelane_b32 v20, s65, 17 -; GFX9-NEXT: v_writelane_b32 v20, s66, 18 -; GFX9-NEXT: v_writelane_b32 v20, s67, 19 -; GFX9-NEXT: v_writelane_b32 v20, s68, 20 -; GFX9-NEXT: v_writelane_b32 v20, s69, 21 -; GFX9-NEXT: v_writelane_b32 v20, s70, 22 -; GFX9-NEXT: v_writelane_b32 v20, s71, 23 -; GFX9-NEXT: v_writelane_b32 v20, s80, 24 -; GFX9-NEXT: v_writelane_b32 v20, s81, 25 -; GFX9-NEXT: v_writelane_b32 v20, s82, 26 -; GFX9-NEXT: v_writelane_b32 v20, s83, 27 -; GFX9-NEXT: v_writelane_b32 v20, s84, 28 -; GFX9-NEXT: v_writelane_b32 v20, s85, 29 -; GFX9-NEXT: v_writelane_b32 v20, s86, 30 -; GFX9-NEXT: v_writelane_b32 v20, s87, 31 -; GFX9-NEXT: v_writelane_b32 v20, s96, 32 -; GFX9-NEXT: v_writelane_b32 v20, s97, 33 +; GFX9-NEXT: v_writelane_b32 v21, s30, 0 +; GFX9-NEXT: v_writelane_b32 v21, s31, 1 +; GFX9-NEXT: v_writelane_b32 v21, s34, 2 +; GFX9-NEXT: v_writelane_b32 v21, s35, 3 +; GFX9-NEXT: v_writelane_b32 v21, s36, 4 +; GFX9-NEXT: v_writelane_b32 v21, s37, 5 +; GFX9-NEXT: v_writelane_b32 v21, s38, 6 +; GFX9-NEXT: v_writelane_b32 v21, s39, 7 +; GFX9-NEXT: v_writelane_b32 v21, s48, 8 +; GFX9-NEXT: v_writelane_b32 v21, s49, 9 +; GFX9-NEXT: v_writelane_b32 v21, s50, 10 +; GFX9-NEXT: v_writelane_b32 v21, s51, 11 +; GFX9-NEXT: v_writelane_b32 v21, s52, 12 +; GFX9-NEXT: v_writelane_b32 v21, s53, 13 +; GFX9-NEXT: v_writelane_b32 v21, s54, 14 +; GFX9-NEXT: v_writelane_b32 v21, s55, 15 +; GFX9-NEXT: v_writelane_b32 v21, s64, 16 +; GFX9-NEXT: v_writelane_b32 v21, s65, 17 +; GFX9-NEXT: v_writelane_b32 v21, s66, 18 +; GFX9-NEXT: v_writelane_b32 v21, s67, 19 +; GFX9-NEXT: v_writelane_b32 v21, s68, 20 +; GFX9-NEXT: v_mov_b32_e32 v20, s16 +; GFX9-NEXT: v_writelane_b32 v21, s69, 21 +; GFX9-NEXT: v_readfirstlane_b32 s56, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s17 +; GFX9-NEXT: v_writelane_b32 v21, s70, 22 +; GFX9-NEXT: v_readfirstlane_b32 s57, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s18 +; GFX9-NEXT: v_writelane_b32 v21, s71, 23 +; GFX9-NEXT: v_readfirstlane_b32 s46, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s19 +; GFX9-NEXT: v_writelane_b32 v21, s80, 24 +; GFX9-NEXT: v_readfirstlane_b32 s47, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s20 +; GFX9-NEXT: v_writelane_b32 v21, s81, 25 +; GFX9-NEXT: v_readfirstlane_b32 s44, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s21 +; GFX9-NEXT: v_writelane_b32 v21, s82, 26 +; GFX9-NEXT: v_readfirstlane_b32 s45, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s22 +; GFX9-NEXT: v_writelane_b32 v21, s83, 27 +; GFX9-NEXT: v_readfirstlane_b32 s42, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s23 +; GFX9-NEXT: v_writelane_b32 v21, s84, 28 +; GFX9-NEXT: v_readfirstlane_b32 s43, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s24 +; GFX9-NEXT: v_writelane_b32 v21, s85, 29 +; GFX9-NEXT: v_readfirstlane_b32 s40, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s25 +; GFX9-NEXT: v_writelane_b32 v21, s86, 30 +; GFX9-NEXT: v_readfirstlane_b32 s41, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s26 +; GFX9-NEXT: v_writelane_b32 v21, s87, 31 +; GFX9-NEXT: v_readfirstlane_b32 s24, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s27 +; GFX9-NEXT: v_writelane_b32 v21, s96, 32 +; GFX9-NEXT: v_readfirstlane_b32 s25, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s28 +; GFX9-NEXT: v_writelane_b32 v21, s97, 33 +; GFX9-NEXT: v_readfirstlane_b32 s22, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; GFX9-NEXT: v_writelane_b32 v20, s98, 34 -; GFX9-NEXT: v_readfirstlane_b32 s44, v1 -; GFX9-NEXT: v_readfirstlane_b32 s45, v2 -; GFX9-NEXT: v_readfirstlane_b32 s42, v3 -; GFX9-NEXT: v_readfirstlane_b32 s43, v4 -; GFX9-NEXT: v_readfirstlane_b32 s40, v5 -; GFX9-NEXT: v_readfirstlane_b32 s41, v6 +; GFX9-NEXT: v_writelane_b32 v21, s98, 34 +; GFX9-NEXT: v_readfirstlane_b32 s23, v20 +; GFX9-NEXT: v_readfirstlane_b32 s20, v1 +; GFX9-NEXT: v_readfirstlane_b32 s21, v2 +; GFX9-NEXT: v_readfirstlane_b32 s18, v3 +; GFX9-NEXT: v_readfirstlane_b32 s19, v4 +; GFX9-NEXT: v_readfirstlane_b32 s16, v5 +; GFX9-NEXT: v_readfirstlane_b32 s17, v6 ; GFX9-NEXT: v_readfirstlane_b32 s14, v7 ; GFX9-NEXT: v_readfirstlane_b32 s15, v8 ; GFX9-NEXT: v_readfirstlane_b32 s12, v9 @@ -84050,181 +84298,181 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: v_readfirstlane_b32 s6, v15 ; GFX9-NEXT: v_readfirstlane_b32 s7, v16 ; GFX9-NEXT: v_readfirstlane_b32 s4, v17 -; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_and_b64 s[26:27], vcc, exec ; GFX9-NEXT: v_readfirstlane_b32 s5, v18 -; GFX9-NEXT: v_writelane_b32 v20, s99, 35 -; GFX9-NEXT: ; implicit-def: $vgpr21 : SGPR spill to VGPR lane +; GFX9-NEXT: v_writelane_b32 v21, s99, 35 +; GFX9-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane ; GFX9-NEXT: s_cbranch_scc0 .LBB57_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_lshr_b32 s46, s5, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 2 -; GFX9-NEXT: s_lshr_b32 s46, s5, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 3 -; GFX9-NEXT: s_lshr_b32 s46, s5, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 4 -; GFX9-NEXT: s_lshr_b32 s46, s4, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 5 -; GFX9-NEXT: s_lshr_b32 s46, s4, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 6 -; GFX9-NEXT: s_lshr_b32 s46, s7, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 7 -; GFX9-NEXT: s_lshr_b32 s46, s7, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 8 -; GFX9-NEXT: s_lshr_b32 s46, s7, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 9 -; GFX9-NEXT: s_lshr_b32 s46, s6, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 10 -; GFX9-NEXT: s_lshr_b32 s46, s6, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 11 -; GFX9-NEXT: s_lshr_b32 s46, s9, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 12 -; GFX9-NEXT: s_lshr_b32 s46, s9, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 13 -; GFX9-NEXT: s_lshr_b32 s46, s9, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 14 -; GFX9-NEXT: s_lshr_b32 s46, s8, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 15 -; GFX9-NEXT: s_lshr_b32 s46, s8, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 16 -; GFX9-NEXT: s_lshr_b32 s46, s11, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 17 -; GFX9-NEXT: s_lshr_b32 s46, s11, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 18 -; GFX9-NEXT: s_lshr_b32 s46, s11, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 19 -; GFX9-NEXT: s_lshr_b32 s46, s10, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 20 -; GFX9-NEXT: s_lshr_b32 s46, s10, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 21 -; GFX9-NEXT: s_lshr_b32 s46, s13, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 22 -; GFX9-NEXT: s_lshr_b32 s46, s13, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 23 -; GFX9-NEXT: s_lshr_b32 s46, s13, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 24 -; GFX9-NEXT: s_lshr_b32 s46, s12, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 25 -; GFX9-NEXT: s_lshr_b32 s46, s12, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 26 -; GFX9-NEXT: s_lshr_b32 s46, s15, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 27 -; GFX9-NEXT: s_lshr_b32 s46, s15, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 28 -; GFX9-NEXT: s_lshr_b32 s46, s15, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 29 -; GFX9-NEXT: s_lshr_b32 s46, s14, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 30 -; GFX9-NEXT: s_lshr_b32 s46, s14, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 31 -; GFX9-NEXT: s_lshr_b32 s46, s41, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 32 -; GFX9-NEXT: s_lshr_b32 s46, s41, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 33 -; GFX9-NEXT: s_lshr_b32 s46, s41, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 34 -; GFX9-NEXT: s_lshr_b32 s46, s40, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 35 -; GFX9-NEXT: s_lshr_b32 s46, s40, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 36 -; GFX9-NEXT: s_lshr_b32 s46, s43, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 37 -; GFX9-NEXT: s_lshr_b32 s46, s43, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 38 -; GFX9-NEXT: s_lshr_b32 s46, s43, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 39 -; GFX9-NEXT: s_lshr_b32 s46, s42, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 40 -; GFX9-NEXT: s_lshr_b32 s46, s42, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 41 -; GFX9-NEXT: s_lshr_b32 s46, s45, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 42 -; GFX9-NEXT: s_lshr_b32 s46, s45, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 43 -; GFX9-NEXT: s_lshr_b32 s46, s45, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 44 -; GFX9-NEXT: s_lshr_b32 s46, s44, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 45 -; GFX9-NEXT: s_lshr_b32 s46, s44, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 46 -; GFX9-NEXT: s_lshr_b32 s46, s29, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 47 -; GFX9-NEXT: s_lshr_b32 s46, s29, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 48 -; GFX9-NEXT: s_lshr_b32 s46, s29, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 49 -; GFX9-NEXT: s_lshr_b32 s46, s28, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 50 -; GFX9-NEXT: s_lshr_b64 s[56:57], s[4:5], 24 -; GFX9-NEXT: v_writelane_b32 v21, s56, 0 -; GFX9-NEXT: s_lshr_b32 s82, s28, 8 -; GFX9-NEXT: s_lshr_b32 s83, s27, 24 -; GFX9-NEXT: s_lshr_b32 s81, s27, 16 -; GFX9-NEXT: s_lshr_b32 s84, s27, 8 -; GFX9-NEXT: s_lshr_b32 s85, s26, 16 -; GFX9-NEXT: s_lshr_b32 s86, s26, 8 -; GFX9-NEXT: s_lshr_b32 s87, s25, 24 -; GFX9-NEXT: s_lshr_b32 s96, s25, 16 -; GFX9-NEXT: s_lshr_b32 s97, s25, 8 -; GFX9-NEXT: s_lshr_b32 s98, s24, 16 -; GFX9-NEXT: s_lshr_b32 s99, s24, 8 -; GFX9-NEXT: s_lshr_b32 s38, s23, 24 -; GFX9-NEXT: s_lshr_b32 s39, s23, 16 -; GFX9-NEXT: s_lshr_b32 s48, s23, 8 -; GFX9-NEXT: s_lshr_b32 s49, s22, 16 -; GFX9-NEXT: s_lshr_b32 s50, s22, 8 -; GFX9-NEXT: s_lshr_b32 s51, s21, 24 -; GFX9-NEXT: s_lshr_b32 s52, s21, 16 -; GFX9-NEXT: s_lshr_b32 s53, s21, 8 -; GFX9-NEXT: s_lshr_b32 s54, s20, 16 -; GFX9-NEXT: s_lshr_b32 s55, s20, 8 -; GFX9-NEXT: s_lshr_b32 s64, s19, 24 -; GFX9-NEXT: s_lshr_b32 s65, s19, 16 -; GFX9-NEXT: s_lshr_b32 s66, s19, 8 -; GFX9-NEXT: s_lshr_b32 s67, s18, 16 -; GFX9-NEXT: s_lshr_b32 s68, s18, 8 -; GFX9-NEXT: s_lshr_b32 s69, s17, 24 -; GFX9-NEXT: s_lshr_b32 s70, s17, 16 -; GFX9-NEXT: s_lshr_b32 s71, s17, 8 -; GFX9-NEXT: s_lshr_b32 s80, s16, 16 -; GFX9-NEXT: s_lshr_b32 s46, s16, 8 -; GFX9-NEXT: v_writelane_b32 v21, s57, 1 -; GFX9-NEXT: s_lshr_b64 s[56:57], s[6:7], 24 +; GFX9-NEXT: s_lshr_b32 s26, s5, 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 2 +; GFX9-NEXT: s_lshr_b32 s26, s5, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 3 +; GFX9-NEXT: s_lshr_b32 s26, s5, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 4 +; GFX9-NEXT: s_lshr_b32 s26, s4, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 5 +; GFX9-NEXT: s_lshr_b32 s26, s4, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 6 +; GFX9-NEXT: s_lshr_b32 s26, s7, 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 7 +; GFX9-NEXT: s_lshr_b32 s26, s7, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 8 +; GFX9-NEXT: s_lshr_b32 s26, s7, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 9 +; GFX9-NEXT: s_lshr_b32 s26, s6, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 10 +; GFX9-NEXT: s_lshr_b32 s26, s6, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 11 +; GFX9-NEXT: s_lshr_b32 s26, s9, 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 12 +; GFX9-NEXT: s_lshr_b32 s26, s9, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 13 +; GFX9-NEXT: s_lshr_b32 s26, s9, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 14 +; GFX9-NEXT: s_lshr_b32 s26, s8, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 15 +; GFX9-NEXT: s_lshr_b32 s26, s8, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 16 +; GFX9-NEXT: s_lshr_b32 s26, s11, 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 17 +; GFX9-NEXT: s_lshr_b32 s26, s11, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 18 +; GFX9-NEXT: s_lshr_b32 s26, s11, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 19 +; GFX9-NEXT: s_lshr_b32 s26, s10, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 20 +; GFX9-NEXT: s_lshr_b32 s26, s10, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 21 +; GFX9-NEXT: s_lshr_b32 s26, s13, 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 22 +; GFX9-NEXT: s_lshr_b32 s26, s13, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 23 +; GFX9-NEXT: s_lshr_b32 s26, s13, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 24 +; GFX9-NEXT: s_lshr_b32 s26, s12, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 25 +; GFX9-NEXT: s_lshr_b32 s26, s12, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 26 +; GFX9-NEXT: s_lshr_b32 s26, s15, 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 27 +; GFX9-NEXT: s_lshr_b32 s26, s15, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 28 +; GFX9-NEXT: s_lshr_b32 s26, s15, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 29 +; GFX9-NEXT: s_lshr_b32 s26, s14, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 30 +; GFX9-NEXT: s_lshr_b32 s26, s14, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 31 +; GFX9-NEXT: s_lshr_b32 s26, s17, 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 32 +; GFX9-NEXT: s_lshr_b32 s26, s17, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 33 +; GFX9-NEXT: s_lshr_b32 s26, s17, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 34 +; GFX9-NEXT: s_lshr_b32 s26, s16, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 35 +; GFX9-NEXT: s_lshr_b32 s26, s16, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 36 +; GFX9-NEXT: s_lshr_b32 s26, s19, 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 37 +; GFX9-NEXT: s_lshr_b32 s26, s19, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 38 +; GFX9-NEXT: s_lshr_b32 s26, s19, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 39 +; GFX9-NEXT: s_lshr_b32 s26, s18, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 40 +; GFX9-NEXT: s_lshr_b32 s26, s18, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 41 +; GFX9-NEXT: s_lshr_b32 s26, s21, 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 42 +; GFX9-NEXT: s_lshr_b32 s26, s21, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 43 +; GFX9-NEXT: s_lshr_b32 s26, s21, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 44 +; GFX9-NEXT: s_lshr_b32 s26, s20, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 45 +; GFX9-NEXT: s_lshr_b32 s26, s20, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 46 +; GFX9-NEXT: s_lshr_b32 s26, s23, 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 47 +; GFX9-NEXT: s_lshr_b32 s26, s23, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 48 +; GFX9-NEXT: s_lshr_b32 s26, s23, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 49 +; GFX9-NEXT: s_lshr_b32 s26, s22, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 50 +; GFX9-NEXT: s_lshr_b64 s[28:29], s[4:5], 24 +; GFX9-NEXT: v_writelane_b32 v22, s28, 0 +; GFX9-NEXT: s_lshr_b32 s82, s22, 8 +; GFX9-NEXT: s_lshr_b32 s83, s25, 24 +; GFX9-NEXT: s_lshr_b32 s81, s25, 16 +; GFX9-NEXT: s_lshr_b32 s84, s25, 8 +; GFX9-NEXT: s_lshr_b32 s85, s24, 16 +; GFX9-NEXT: s_lshr_b32 s86, s24, 8 +; GFX9-NEXT: s_lshr_b32 s87, s41, 24 +; GFX9-NEXT: s_lshr_b32 s96, s41, 16 +; GFX9-NEXT: s_lshr_b32 s97, s41, 8 +; GFX9-NEXT: s_lshr_b32 s98, s40, 16 +; GFX9-NEXT: s_lshr_b32 s99, s40, 8 +; GFX9-NEXT: s_lshr_b32 s38, s43, 24 +; GFX9-NEXT: s_lshr_b32 s39, s43, 16 +; GFX9-NEXT: s_lshr_b32 s48, s43, 8 +; GFX9-NEXT: s_lshr_b32 s49, s42, 16 +; GFX9-NEXT: s_lshr_b32 s50, s42, 8 +; GFX9-NEXT: s_lshr_b32 s51, s45, 24 +; GFX9-NEXT: s_lshr_b32 s52, s45, 16 +; GFX9-NEXT: s_lshr_b32 s53, s45, 8 +; GFX9-NEXT: s_lshr_b32 s54, s44, 16 +; GFX9-NEXT: s_lshr_b32 s55, s44, 8 +; GFX9-NEXT: s_lshr_b32 s64, s47, 24 +; GFX9-NEXT: s_lshr_b32 s65, s47, 16 +; GFX9-NEXT: s_lshr_b32 s66, s47, 8 +; GFX9-NEXT: s_lshr_b32 s67, s46, 16 +; GFX9-NEXT: s_lshr_b32 s68, s46, 8 +; GFX9-NEXT: s_lshr_b32 s69, s57, 24 +; GFX9-NEXT: s_lshr_b32 s70, s57, 16 +; GFX9-NEXT: s_lshr_b32 s71, s57, 8 +; GFX9-NEXT: s_lshr_b32 s80, s56, 16 +; GFX9-NEXT: s_lshr_b32 s26, s56, 8 +; GFX9-NEXT: v_writelane_b32 v22, s29, 1 +; GFX9-NEXT: s_lshr_b64 s[28:29], s[6:7], 24 ; GFX9-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 ; GFX9-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 ; GFX9-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 ; GFX9-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 -; GFX9-NEXT: s_lshr_b64 s[74:75], s[40:41], 24 -; GFX9-NEXT: s_lshr_b64 s[76:77], s[42:43], 24 -; GFX9-NEXT: s_lshr_b64 s[78:79], s[44:45], 24 -; GFX9-NEXT: s_lshr_b64 s[88:89], s[28:29], 24 -; GFX9-NEXT: s_lshr_b64 s[90:91], s[26:27], 24 -; GFX9-NEXT: s_lshr_b64 s[92:93], s[24:25], 24 -; GFX9-NEXT: s_lshr_b64 s[94:95], s[22:23], 24 -; GFX9-NEXT: s_lshr_b64 s[30:31], s[20:21], 24 -; GFX9-NEXT: s_lshr_b64 s[34:35], s[18:19], 24 -; GFX9-NEXT: s_lshr_b64 s[36:37], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[74:75], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[76:77], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[88:89], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[90:91], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[92:93], s[40:41], 24 +; GFX9-NEXT: s_lshr_b64 s[94:95], s[42:43], 24 +; GFX9-NEXT: s_lshr_b64 s[30:31], s[44:45], 24 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[46:47], 24 +; GFX9-NEXT: s_lshr_b64 s[36:37], s[56:57], 24 ; GFX9-NEXT: s_cbranch_execnz .LBB57_3 ; GFX9-NEXT: .LBB57_2: ; %cmp.true -; GFX9-NEXT: s_add_u32 s16, s16, 3 -; GFX9-NEXT: s_addc_u32 s17, s17, 0 -; GFX9-NEXT: s_add_u32 s18, s18, 3 -; GFX9-NEXT: s_addc_u32 s19, s19, 0 -; GFX9-NEXT: s_add_u32 s20, s20, 3 -; GFX9-NEXT: s_addc_u32 s21, s21, 0 -; GFX9-NEXT: s_add_u32 s22, s22, 3 -; GFX9-NEXT: s_addc_u32 s23, s23, 0 -; GFX9-NEXT: s_add_u32 s24, s24, 3 -; GFX9-NEXT: s_addc_u32 s25, s25, 0 -; GFX9-NEXT: s_add_u32 s26, s26, 3 -; GFX9-NEXT: s_addc_u32 s27, s27, 0 -; GFX9-NEXT: s_add_u32 s28, s28, 3 -; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: s_add_u32 s56, s56, 3 +; GFX9-NEXT: s_addc_u32 s57, s57, 0 +; GFX9-NEXT: s_add_u32 s46, s46, 3 +; GFX9-NEXT: s_addc_u32 s47, s47, 0 ; GFX9-NEXT: s_add_u32 s44, s44, 3 ; GFX9-NEXT: s_addc_u32 s45, s45, 0 ; GFX9-NEXT: s_add_u32 s42, s42, 3 ; GFX9-NEXT: s_addc_u32 s43, s43, 0 ; GFX9-NEXT: s_add_u32 s40, s40, 3 ; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 ; GFX9-NEXT: s_add_u32 s14, s14, 3 ; GFX9-NEXT: s_addc_u32 s15, s15, 0 ; GFX9-NEXT: s_add_u32 s12, s12, 3 @@ -84237,298 +84485,297 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: s_addc_u32 s7, s7, 0 ; GFX9-NEXT: s_add_u32 s4, s4, 3 ; GFX9-NEXT: s_addc_u32 s5, s5, 0 -; GFX9-NEXT: s_lshr_b32 s46, s5, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 2 -; GFX9-NEXT: s_lshr_b32 s46, s5, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 3 -; GFX9-NEXT: s_lshr_b32 s46, s5, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 4 -; GFX9-NEXT: s_lshr_b32 s46, s4, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 5 -; GFX9-NEXT: s_lshr_b32 s46, s4, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 6 -; GFX9-NEXT: s_lshr_b32 s46, s7, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 7 -; GFX9-NEXT: s_lshr_b32 s46, s7, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 8 -; GFX9-NEXT: s_lshr_b32 s46, s7, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 9 -; GFX9-NEXT: s_lshr_b32 s46, s6, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 10 -; GFX9-NEXT: s_lshr_b32 s46, s6, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 11 -; GFX9-NEXT: s_lshr_b32 s46, s9, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 12 -; GFX9-NEXT: s_lshr_b32 s46, s9, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 13 -; GFX9-NEXT: s_lshr_b32 s46, s9, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 14 -; GFX9-NEXT: s_lshr_b32 s46, s8, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 15 -; GFX9-NEXT: s_lshr_b32 s46, s8, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 16 -; GFX9-NEXT: s_lshr_b32 s46, s11, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 17 -; GFX9-NEXT: s_lshr_b32 s46, s11, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 18 -; GFX9-NEXT: s_lshr_b32 s46, s11, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 19 -; GFX9-NEXT: s_lshr_b32 s46, s10, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 20 -; GFX9-NEXT: s_lshr_b32 s46, s10, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 21 -; GFX9-NEXT: s_lshr_b32 s46, s13, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 22 -; GFX9-NEXT: s_lshr_b32 s46, s13, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 23 -; GFX9-NEXT: s_lshr_b32 s46, s13, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 24 -; GFX9-NEXT: s_lshr_b32 s46, s12, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 25 -; GFX9-NEXT: s_lshr_b32 s46, s12, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 26 -; GFX9-NEXT: s_lshr_b32 s46, s15, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 27 -; GFX9-NEXT: s_lshr_b32 s46, s15, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 28 -; GFX9-NEXT: s_lshr_b32 s46, s15, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 29 -; GFX9-NEXT: s_lshr_b32 s46, s14, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 30 -; GFX9-NEXT: s_lshr_b32 s46, s14, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 31 -; GFX9-NEXT: s_lshr_b32 s46, s41, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 32 -; GFX9-NEXT: s_lshr_b32 s46, s41, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 33 -; GFX9-NEXT: s_lshr_b32 s46, s41, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 34 -; GFX9-NEXT: s_lshr_b32 s46, s40, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 35 -; GFX9-NEXT: s_lshr_b32 s46, s40, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 36 -; GFX9-NEXT: s_lshr_b32 s46, s43, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 37 -; GFX9-NEXT: s_lshr_b32 s46, s43, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 38 -; GFX9-NEXT: s_lshr_b32 s46, s43, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 39 -; GFX9-NEXT: s_lshr_b32 s46, s42, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 40 -; GFX9-NEXT: s_lshr_b32 s46, s42, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 41 -; GFX9-NEXT: s_lshr_b32 s46, s45, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 42 -; GFX9-NEXT: s_lshr_b32 s46, s45, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 43 -; GFX9-NEXT: s_lshr_b32 s46, s45, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 44 -; GFX9-NEXT: s_lshr_b32 s46, s44, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 45 -; GFX9-NEXT: s_lshr_b32 s46, s44, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 46 -; GFX9-NEXT: s_lshr_b32 s46, s29, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 47 -; GFX9-NEXT: s_lshr_b32 s46, s29, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 48 -; GFX9-NEXT: s_lshr_b32 s46, s29, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 49 -; GFX9-NEXT: s_lshr_b32 s46, s28, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 50 -; GFX9-NEXT: s_lshr_b64 s[56:57], s[4:5], 24 -; GFX9-NEXT: v_writelane_b32 v21, s56, 0 -; GFX9-NEXT: s_lshr_b32 s82, s28, 8 -; GFX9-NEXT: s_lshr_b32 s83, s27, 24 -; GFX9-NEXT: s_lshr_b32 s81, s27, 16 -; GFX9-NEXT: s_lshr_b32 s84, s27, 8 -; GFX9-NEXT: s_lshr_b32 s85, s26, 16 -; GFX9-NEXT: s_lshr_b32 s86, s26, 8 -; GFX9-NEXT: s_lshr_b32 s87, s25, 24 -; GFX9-NEXT: s_lshr_b32 s96, s25, 16 -; GFX9-NEXT: s_lshr_b32 s97, s25, 8 -; GFX9-NEXT: s_lshr_b32 s98, s24, 16 -; GFX9-NEXT: s_lshr_b32 s99, s24, 8 -; GFX9-NEXT: s_lshr_b32 s38, s23, 24 -; GFX9-NEXT: s_lshr_b32 s39, s23, 16 -; GFX9-NEXT: s_lshr_b32 s48, s23, 8 -; GFX9-NEXT: s_lshr_b32 s49, s22, 16 -; GFX9-NEXT: s_lshr_b32 s50, s22, 8 -; GFX9-NEXT: s_lshr_b32 s51, s21, 24 -; GFX9-NEXT: s_lshr_b32 s52, s21, 16 -; GFX9-NEXT: s_lshr_b32 s53, s21, 8 -; GFX9-NEXT: s_lshr_b32 s54, s20, 16 -; GFX9-NEXT: s_lshr_b32 s55, s20, 8 -; GFX9-NEXT: s_lshr_b32 s64, s19, 24 -; GFX9-NEXT: s_lshr_b32 s65, s19, 16 -; GFX9-NEXT: s_lshr_b32 s66, s19, 8 -; GFX9-NEXT: s_lshr_b32 s67, s18, 16 -; GFX9-NEXT: s_lshr_b32 s68, s18, 8 -; GFX9-NEXT: s_lshr_b32 s69, s17, 24 -; GFX9-NEXT: s_lshr_b32 s70, s17, 16 -; GFX9-NEXT: s_lshr_b32 s71, s17, 8 -; GFX9-NEXT: s_lshr_b32 s80, s16, 16 -; GFX9-NEXT: s_lshr_b32 s46, s16, 8 -; GFX9-NEXT: v_writelane_b32 v21, s57, 1 -; GFX9-NEXT: s_lshr_b64 s[56:57], s[6:7], 24 +; GFX9-NEXT: s_lshr_b32 s26, s5, 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 2 +; GFX9-NEXT: s_lshr_b32 s26, s5, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 3 +; GFX9-NEXT: s_lshr_b32 s26, s5, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 4 +; GFX9-NEXT: s_lshr_b32 s26, s4, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 5 +; GFX9-NEXT: s_lshr_b32 s26, s4, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 6 +; GFX9-NEXT: s_lshr_b32 s26, s7, 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 7 +; GFX9-NEXT: s_lshr_b32 s26, s7, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 8 +; GFX9-NEXT: s_lshr_b32 s26, s7, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 9 +; GFX9-NEXT: s_lshr_b32 s26, s6, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 10 +; GFX9-NEXT: s_lshr_b32 s26, s6, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 11 +; GFX9-NEXT: s_lshr_b32 s26, s9, 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 12 +; GFX9-NEXT: s_lshr_b32 s26, s9, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 13 +; GFX9-NEXT: s_lshr_b32 s26, s9, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 14 +; GFX9-NEXT: s_lshr_b32 s26, s8, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 15 +; GFX9-NEXT: s_lshr_b32 s26, s8, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 16 +; GFX9-NEXT: s_lshr_b32 s26, s11, 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 17 +; GFX9-NEXT: s_lshr_b32 s26, s11, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 18 +; GFX9-NEXT: s_lshr_b32 s26, s11, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 19 +; GFX9-NEXT: s_lshr_b32 s26, s10, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 20 +; GFX9-NEXT: s_lshr_b32 s26, s10, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 21 +; GFX9-NEXT: s_lshr_b32 s26, s13, 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 22 +; GFX9-NEXT: s_lshr_b32 s26, s13, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 23 +; GFX9-NEXT: s_lshr_b32 s26, s13, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 24 +; GFX9-NEXT: s_lshr_b32 s26, s12, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 25 +; GFX9-NEXT: s_lshr_b32 s26, s12, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 26 +; GFX9-NEXT: s_lshr_b32 s26, s15, 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 27 +; GFX9-NEXT: s_lshr_b32 s26, s15, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 28 +; GFX9-NEXT: s_lshr_b32 s26, s15, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 29 +; GFX9-NEXT: s_lshr_b32 s26, s14, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 30 +; GFX9-NEXT: s_lshr_b32 s26, s14, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 31 +; GFX9-NEXT: s_lshr_b32 s26, s17, 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 32 +; GFX9-NEXT: s_lshr_b32 s26, s17, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 33 +; GFX9-NEXT: s_lshr_b32 s26, s17, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 34 +; GFX9-NEXT: s_lshr_b32 s26, s16, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 35 +; GFX9-NEXT: s_lshr_b32 s26, s16, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 36 +; GFX9-NEXT: s_lshr_b32 s26, s19, 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 37 +; GFX9-NEXT: s_lshr_b32 s26, s19, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 38 +; GFX9-NEXT: s_lshr_b32 s26, s19, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 39 +; GFX9-NEXT: s_lshr_b32 s26, s18, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 40 +; GFX9-NEXT: s_lshr_b32 s26, s18, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 41 +; GFX9-NEXT: s_lshr_b32 s26, s21, 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 42 +; GFX9-NEXT: s_lshr_b32 s26, s21, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 43 +; GFX9-NEXT: s_lshr_b32 s26, s21, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 44 +; GFX9-NEXT: s_lshr_b32 s26, s20, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 45 +; GFX9-NEXT: s_lshr_b32 s26, s20, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 46 +; GFX9-NEXT: s_lshr_b32 s26, s23, 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 47 +; GFX9-NEXT: s_lshr_b32 s26, s23, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 48 +; GFX9-NEXT: s_lshr_b32 s26, s23, 8 +; GFX9-NEXT: v_writelane_b32 v22, s26, 49 +; GFX9-NEXT: s_lshr_b32 s26, s22, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 50 +; GFX9-NEXT: s_lshr_b64 s[28:29], s[4:5], 24 +; GFX9-NEXT: v_writelane_b32 v22, s28, 0 +; GFX9-NEXT: s_lshr_b32 s82, s22, 8 +; GFX9-NEXT: s_lshr_b32 s83, s25, 24 +; GFX9-NEXT: s_lshr_b32 s81, s25, 16 +; GFX9-NEXT: s_lshr_b32 s84, s25, 8 +; GFX9-NEXT: s_lshr_b32 s85, s24, 16 +; GFX9-NEXT: s_lshr_b32 s86, s24, 8 +; GFX9-NEXT: s_lshr_b32 s87, s41, 24 +; GFX9-NEXT: s_lshr_b32 s96, s41, 16 +; GFX9-NEXT: s_lshr_b32 s97, s41, 8 +; GFX9-NEXT: s_lshr_b32 s98, s40, 16 +; GFX9-NEXT: s_lshr_b32 s99, s40, 8 +; GFX9-NEXT: s_lshr_b32 s38, s43, 24 +; GFX9-NEXT: s_lshr_b32 s39, s43, 16 +; GFX9-NEXT: s_lshr_b32 s48, s43, 8 +; GFX9-NEXT: s_lshr_b32 s49, s42, 16 +; GFX9-NEXT: s_lshr_b32 s50, s42, 8 +; GFX9-NEXT: s_lshr_b32 s51, s45, 24 +; GFX9-NEXT: s_lshr_b32 s52, s45, 16 +; GFX9-NEXT: s_lshr_b32 s53, s45, 8 +; GFX9-NEXT: s_lshr_b32 s54, s44, 16 +; GFX9-NEXT: s_lshr_b32 s55, s44, 8 +; GFX9-NEXT: s_lshr_b32 s64, s47, 24 +; GFX9-NEXT: s_lshr_b32 s65, s47, 16 +; GFX9-NEXT: s_lshr_b32 s66, s47, 8 +; GFX9-NEXT: s_lshr_b32 s67, s46, 16 +; GFX9-NEXT: s_lshr_b32 s68, s46, 8 +; GFX9-NEXT: s_lshr_b32 s69, s57, 24 +; GFX9-NEXT: s_lshr_b32 s70, s57, 16 +; GFX9-NEXT: s_lshr_b32 s71, s57, 8 +; GFX9-NEXT: s_lshr_b32 s80, s56, 16 +; GFX9-NEXT: s_lshr_b32 s26, s56, 8 +; GFX9-NEXT: v_writelane_b32 v22, s29, 1 +; GFX9-NEXT: s_lshr_b64 s[28:29], s[6:7], 24 ; GFX9-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 ; GFX9-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 ; GFX9-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 ; GFX9-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 -; GFX9-NEXT: s_lshr_b64 s[74:75], s[40:41], 24 -; GFX9-NEXT: s_lshr_b64 s[76:77], s[42:43], 24 -; GFX9-NEXT: s_lshr_b64 s[78:79], s[44:45], 24 -; GFX9-NEXT: s_lshr_b64 s[88:89], s[28:29], 24 -; GFX9-NEXT: s_lshr_b64 s[90:91], s[26:27], 24 -; GFX9-NEXT: s_lshr_b64 s[92:93], s[24:25], 24 -; GFX9-NEXT: s_lshr_b64 s[94:95], s[22:23], 24 -; GFX9-NEXT: s_lshr_b64 s[30:31], s[20:21], 24 -; GFX9-NEXT: s_lshr_b64 s[34:35], s[18:19], 24 -; GFX9-NEXT: s_lshr_b64 s[36:37], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[74:75], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[76:77], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[88:89], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[90:91], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[92:93], s[40:41], 24 +; GFX9-NEXT: s_lshr_b64 s[94:95], s[42:43], 24 +; GFX9-NEXT: s_lshr_b64 s[30:31], s[44:45], 24 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[46:47], 24 +; GFX9-NEXT: s_lshr_b64 s[36:37], s[56:57], 24 ; GFX9-NEXT: .LBB57_3: ; %end -; GFX9-NEXT: s_lshl_b32 s46, s46, 8 -; GFX9-NEXT: s_and_b32 s16, s16, 0xff -; GFX9-NEXT: s_or_b32 s16, s16, s46 -; GFX9-NEXT: s_lshl_b32 s46, s36, 8 -; GFX9-NEXT: s_and_b32 s47, s80, 0xff -; GFX9-NEXT: s_or_b32 s46, s47, s46 -; GFX9-NEXT: s_and_b32 s16, s16, 0xffff -; GFX9-NEXT: s_lshl_b32 s46, s46, 16 -; GFX9-NEXT: s_or_b32 s16, s16, s46 -; GFX9-NEXT: v_mov_b32_e32 v1, s16 -; GFX9-NEXT: s_and_b32 s16, s17, 0xff -; GFX9-NEXT: s_lshl_b32 s17, s71, 8 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: s_and_b32 s17, s70, 0xff -; GFX9-NEXT: s_lshl_b32 s46, s69, 8 -; GFX9-NEXT: s_or_b32 s17, s17, s46 -; GFX9-NEXT: s_and_b32 s16, s16, 0xffff -; GFX9-NEXT: s_lshl_b32 s17, s17, 16 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: s_lshl_b32 s16, s68, 8 -; GFX9-NEXT: s_and_b32 s17, s18, 0xff -; GFX9-NEXT: s_or_b32 s16, s17, s16 -; GFX9-NEXT: s_lshl_b32 s17, s34, 8 -; GFX9-NEXT: s_and_b32 s18, s67, 0xff -; GFX9-NEXT: s_or_b32 s17, s18, s17 -; GFX9-NEXT: s_and_b32 s16, s16, 0xffff -; GFX9-NEXT: s_lshl_b32 s17, s17, 16 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_mov_b32_e32 v3, s16 -; GFX9-NEXT: s_and_b32 s16, s19, 0xff -; GFX9-NEXT: s_lshl_b32 s17, s66, 8 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: s_and_b32 s17, s65, 0xff -; GFX9-NEXT: s_lshl_b32 s18, s64, 8 -; GFX9-NEXT: s_or_b32 s17, s17, s18 -; GFX9-NEXT: s_and_b32 s16, s16, 0xffff -; GFX9-NEXT: s_lshl_b32 s17, s17, 16 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-NEXT: s_lshl_b32 s16, s55, 8 -; GFX9-NEXT: s_and_b32 s17, s20, 0xff -; GFX9-NEXT: s_or_b32 s16, s17, s16 -; GFX9-NEXT: s_lshl_b32 s17, s30, 8 -; GFX9-NEXT: s_and_b32 s18, s54, 0xff -; GFX9-NEXT: s_or_b32 s17, s18, s17 -; GFX9-NEXT: s_and_b32 s16, s16, 0xffff -; GFX9-NEXT: s_lshl_b32 s17, s17, 16 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_mov_b32_e32 v5, s16 -; GFX9-NEXT: s_and_b32 s16, s21, 0xff -; GFX9-NEXT: s_lshl_b32 s17, s53, 8 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: s_and_b32 s17, s52, 0xff -; GFX9-NEXT: s_lshl_b32 s18, s51, 8 -; GFX9-NEXT: s_or_b32 s17, s17, s18 -; GFX9-NEXT: s_and_b32 s16, s16, 0xffff -; GFX9-NEXT: s_lshl_b32 s17, s17, 16 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_mov_b32_e32 v6, s16 -; GFX9-NEXT: s_lshl_b32 s16, s50, 8 -; GFX9-NEXT: s_and_b32 s17, s22, 0xff -; GFX9-NEXT: s_or_b32 s16, s17, s16 -; GFX9-NEXT: s_lshl_b32 s17, s94, 8 -; GFX9-NEXT: s_and_b32 s18, s49, 0xff -; GFX9-NEXT: s_or_b32 s17, s18, s17 -; GFX9-NEXT: s_and_b32 s16, s16, 0xffff -; GFX9-NEXT: s_lshl_b32 s17, s17, 16 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_mov_b32_e32 v7, s16 -; GFX9-NEXT: s_and_b32 s16, s23, 0xff -; GFX9-NEXT: s_lshl_b32 s17, s48, 8 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: s_and_b32 s17, s39, 0xff -; GFX9-NEXT: s_lshl_b32 s18, s38, 8 -; GFX9-NEXT: s_or_b32 s17, s17, s18 -; GFX9-NEXT: s_and_b32 s16, s16, 0xffff -; GFX9-NEXT: s_lshl_b32 s17, s17, 16 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_mov_b32_e32 v8, s16 -; GFX9-NEXT: s_lshl_b32 s16, s99, 8 -; GFX9-NEXT: s_and_b32 s17, s24, 0xff -; GFX9-NEXT: s_or_b32 s16, s17, s16 -; GFX9-NEXT: s_lshl_b32 s17, s92, 8 -; GFX9-NEXT: s_and_b32 s18, s98, 0xff -; GFX9-NEXT: s_or_b32 s17, s18, s17 -; GFX9-NEXT: s_and_b32 s16, s16, 0xffff -; GFX9-NEXT: s_lshl_b32 s17, s17, 16 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_mov_b32_e32 v9, s16 -; GFX9-NEXT: s_and_b32 s16, s25, 0xff -; GFX9-NEXT: s_lshl_b32 s17, s97, 8 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: s_and_b32 s17, s96, 0xff -; GFX9-NEXT: s_lshl_b32 s18, s87, 8 -; GFX9-NEXT: s_or_b32 s17, s17, s18 -; GFX9-NEXT: s_and_b32 s16, s16, 0xffff -; GFX9-NEXT: s_lshl_b32 s17, s17, 16 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_mov_b32_e32 v10, s16 -; GFX9-NEXT: s_lshl_b32 s16, s86, 8 -; GFX9-NEXT: s_and_b32 s17, s26, 0xff -; GFX9-NEXT: s_or_b32 s16, s17, s16 -; GFX9-NEXT: s_lshl_b32 s17, s90, 8 -; GFX9-NEXT: s_and_b32 s18, s85, 0xff -; GFX9-NEXT: s_or_b32 s17, s18, s17 -; GFX9-NEXT: s_and_b32 s16, s16, 0xffff -; GFX9-NEXT: s_lshl_b32 s17, s17, 16 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_mov_b32_e32 v11, s16 -; GFX9-NEXT: s_and_b32 s16, s27, 0xff -; GFX9-NEXT: s_lshl_b32 s17, s84, 8 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: s_and_b32 s17, s81, 0xff -; GFX9-NEXT: s_lshl_b32 s18, s83, 8 -; GFX9-NEXT: s_or_b32 s17, s17, s18 -; GFX9-NEXT: s_and_b32 s16, s16, 0xffff -; GFX9-NEXT: s_lshl_b32 s17, s17, 16 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_mov_b32_e32 v12, s16 -; GFX9-NEXT: s_lshl_b32 s16, s82, 8 -; GFX9-NEXT: s_and_b32 s17, s28, 0xff -; GFX9-NEXT: v_readlane_b32 s18, v21, 50 -; GFX9-NEXT: s_or_b32 s16, s17, s16 -; GFX9-NEXT: s_lshl_b32 s17, s88, 8 -; GFX9-NEXT: s_and_b32 s18, s18, 0xff -; GFX9-NEXT: s_or_b32 s17, s18, s17 -; GFX9-NEXT: s_and_b32 s16, s16, 0xffff -; GFX9-NEXT: s_lshl_b32 s17, s17, 16 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 49 -; GFX9-NEXT: v_mov_b32_e32 v13, s16 -; GFX9-NEXT: s_and_b32 s16, s29, 0xff -; GFX9-NEXT: s_lshl_b32 s17, s17, 8 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 48 -; GFX9-NEXT: v_readlane_b32 s18, v21, 47 -; GFX9-NEXT: s_and_b32 s17, s17, 0xff -; GFX9-NEXT: s_lshl_b32 s18, s18, 8 -; GFX9-NEXT: s_or_b32 s17, s17, s18 -; GFX9-NEXT: s_and_b32 s16, s16, 0xffff -; GFX9-NEXT: s_lshl_b32 s17, s17, 16 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 46 +; GFX9-NEXT: s_lshl_b32 s26, s26, 8 +; GFX9-NEXT: s_and_b32 s27, s56, 0xff +; GFX9-NEXT: s_or_b32 s26, s27, s26 +; GFX9-NEXT: s_lshl_b32 s27, s36, 8 +; GFX9-NEXT: s_and_b32 s29, s80, 0xff +; GFX9-NEXT: s_or_b32 s27, s29, s27 +; GFX9-NEXT: s_and_b32 s26, s26, 0xffff +; GFX9-NEXT: s_lshl_b32 s27, s27, 16 +; GFX9-NEXT: s_or_b32 s26, s26, s27 +; GFX9-NEXT: v_mov_b32_e32 v1, s26 +; GFX9-NEXT: s_and_b32 s26, s57, 0xff +; GFX9-NEXT: s_lshl_b32 s27, s71, 8 +; GFX9-NEXT: s_or_b32 s26, s26, s27 +; GFX9-NEXT: s_and_b32 s27, s70, 0xff +; GFX9-NEXT: s_lshl_b32 s29, s69, 8 +; GFX9-NEXT: s_or_b32 s27, s27, s29 +; GFX9-NEXT: s_and_b32 s26, s26, 0xffff +; GFX9-NEXT: s_lshl_b32 s27, s27, 16 +; GFX9-NEXT: s_or_b32 s26, s26, s27 +; GFX9-NEXT: v_mov_b32_e32 v2, s26 +; GFX9-NEXT: s_lshl_b32 s26, s68, 8 +; GFX9-NEXT: s_and_b32 s27, s46, 0xff +; GFX9-NEXT: s_or_b32 s26, s27, s26 +; GFX9-NEXT: s_lshl_b32 s27, s34, 8 +; GFX9-NEXT: s_and_b32 s29, s67, 0xff +; GFX9-NEXT: s_or_b32 s27, s29, s27 +; GFX9-NEXT: s_and_b32 s26, s26, 0xffff +; GFX9-NEXT: s_lshl_b32 s27, s27, 16 +; GFX9-NEXT: s_or_b32 s26, s26, s27 +; GFX9-NEXT: v_mov_b32_e32 v3, s26 +; GFX9-NEXT: s_and_b32 s26, s47, 0xff +; GFX9-NEXT: s_lshl_b32 s27, s66, 8 +; GFX9-NEXT: s_or_b32 s26, s26, s27 +; GFX9-NEXT: s_and_b32 s27, s65, 0xff +; GFX9-NEXT: s_lshl_b32 s29, s64, 8 +; GFX9-NEXT: s_or_b32 s27, s27, s29 +; GFX9-NEXT: s_and_b32 s26, s26, 0xffff +; GFX9-NEXT: s_lshl_b32 s27, s27, 16 +; GFX9-NEXT: s_or_b32 s26, s26, s27 +; GFX9-NEXT: v_mov_b32_e32 v4, s26 +; GFX9-NEXT: s_lshl_b32 s26, s55, 8 +; GFX9-NEXT: s_and_b32 s27, s44, 0xff +; GFX9-NEXT: s_or_b32 s26, s27, s26 +; GFX9-NEXT: s_lshl_b32 s27, s30, 8 +; GFX9-NEXT: s_and_b32 s29, s54, 0xff +; GFX9-NEXT: s_or_b32 s27, s29, s27 +; GFX9-NEXT: s_and_b32 s26, s26, 0xffff +; GFX9-NEXT: s_lshl_b32 s27, s27, 16 +; GFX9-NEXT: s_or_b32 s26, s26, s27 +; GFX9-NEXT: v_mov_b32_e32 v5, s26 +; GFX9-NEXT: s_and_b32 s26, s45, 0xff +; GFX9-NEXT: s_lshl_b32 s27, s53, 8 +; GFX9-NEXT: s_or_b32 s26, s26, s27 +; GFX9-NEXT: s_and_b32 s27, s52, 0xff +; GFX9-NEXT: s_lshl_b32 s29, s51, 8 +; GFX9-NEXT: s_or_b32 s27, s27, s29 +; GFX9-NEXT: s_and_b32 s26, s26, 0xffff +; GFX9-NEXT: s_lshl_b32 s27, s27, 16 +; GFX9-NEXT: s_or_b32 s26, s26, s27 +; GFX9-NEXT: v_mov_b32_e32 v6, s26 +; GFX9-NEXT: s_lshl_b32 s26, s50, 8 +; GFX9-NEXT: s_and_b32 s27, s42, 0xff +; GFX9-NEXT: s_or_b32 s26, s27, s26 +; GFX9-NEXT: s_lshl_b32 s27, s94, 8 +; GFX9-NEXT: s_and_b32 s29, s49, 0xff +; GFX9-NEXT: s_or_b32 s27, s29, s27 +; GFX9-NEXT: s_and_b32 s26, s26, 0xffff +; GFX9-NEXT: s_lshl_b32 s27, s27, 16 +; GFX9-NEXT: s_or_b32 s26, s26, s27 +; GFX9-NEXT: v_mov_b32_e32 v7, s26 +; GFX9-NEXT: s_and_b32 s26, s43, 0xff +; GFX9-NEXT: s_lshl_b32 s27, s48, 8 +; GFX9-NEXT: s_or_b32 s26, s26, s27 +; GFX9-NEXT: s_and_b32 s27, s39, 0xff +; GFX9-NEXT: s_lshl_b32 s29, s38, 8 +; GFX9-NEXT: s_or_b32 s27, s27, s29 +; GFX9-NEXT: s_and_b32 s26, s26, 0xffff +; GFX9-NEXT: s_lshl_b32 s27, s27, 16 +; GFX9-NEXT: s_or_b32 s26, s26, s27 +; GFX9-NEXT: v_mov_b32_e32 v8, s26 +; GFX9-NEXT: s_lshl_b32 s26, s99, 8 +; GFX9-NEXT: s_and_b32 s27, s40, 0xff +; GFX9-NEXT: s_or_b32 s26, s27, s26 +; GFX9-NEXT: s_lshl_b32 s27, s92, 8 +; GFX9-NEXT: s_and_b32 s29, s98, 0xff +; GFX9-NEXT: s_or_b32 s27, s29, s27 +; GFX9-NEXT: s_and_b32 s26, s26, 0xffff +; GFX9-NEXT: s_lshl_b32 s27, s27, 16 +; GFX9-NEXT: s_or_b32 s26, s26, s27 +; GFX9-NEXT: v_mov_b32_e32 v9, s26 +; GFX9-NEXT: s_and_b32 s26, s41, 0xff +; GFX9-NEXT: s_lshl_b32 s27, s97, 8 +; GFX9-NEXT: s_or_b32 s26, s26, s27 +; GFX9-NEXT: s_and_b32 s27, s96, 0xff +; GFX9-NEXT: s_lshl_b32 s29, s87, 8 +; GFX9-NEXT: s_or_b32 s27, s27, s29 +; GFX9-NEXT: s_and_b32 s26, s26, 0xffff +; GFX9-NEXT: s_lshl_b32 s27, s27, 16 +; GFX9-NEXT: s_or_b32 s26, s26, s27 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: s_lshl_b32 s26, s86, 8 +; GFX9-NEXT: s_and_b32 s24, s24, 0xff +; GFX9-NEXT: s_or_b32 s24, s24, s26 +; GFX9-NEXT: s_lshl_b32 s26, s90, 8 +; GFX9-NEXT: s_and_b32 s27, s85, 0xff +; GFX9-NEXT: s_or_b32 s26, s27, s26 +; GFX9-NEXT: s_and_b32 s24, s24, 0xffff +; GFX9-NEXT: s_lshl_b32 s26, s26, 16 +; GFX9-NEXT: s_or_b32 s24, s24, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s24 +; GFX9-NEXT: s_and_b32 s24, s25, 0xff +; GFX9-NEXT: s_lshl_b32 s25, s84, 8 +; GFX9-NEXT: s_or_b32 s24, s24, s25 +; GFX9-NEXT: s_and_b32 s25, s81, 0xff +; GFX9-NEXT: s_lshl_b32 s26, s83, 8 +; GFX9-NEXT: s_or_b32 s25, s25, s26 +; GFX9-NEXT: s_and_b32 s24, s24, 0xffff +; GFX9-NEXT: s_lshl_b32 s25, s25, 16 +; GFX9-NEXT: s_or_b32 s24, s24, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s24 +; GFX9-NEXT: s_lshl_b32 s24, s82, 8 +; GFX9-NEXT: s_and_b32 s22, s22, 0xff +; GFX9-NEXT: v_readlane_b32 s25, v22, 50 +; GFX9-NEXT: s_or_b32 s22, s22, s24 +; GFX9-NEXT: s_lshl_b32 s24, s88, 8 +; GFX9-NEXT: s_and_b32 s25, s25, 0xff +; GFX9-NEXT: s_or_b32 s24, s25, s24 +; GFX9-NEXT: s_and_b32 s22, s22, 0xffff +; GFX9-NEXT: s_lshl_b32 s24, s24, 16 +; GFX9-NEXT: s_or_b32 s22, s22, s24 +; GFX9-NEXT: v_mov_b32_e32 v13, s22 +; GFX9-NEXT: s_and_b32 s22, s23, 0xff +; GFX9-NEXT: v_readlane_b32 s23, v22, 49 +; GFX9-NEXT: s_lshl_b32 s23, s23, 8 +; GFX9-NEXT: s_or_b32 s22, s22, s23 +; GFX9-NEXT: v_readlane_b32 s23, v22, 48 +; GFX9-NEXT: v_readlane_b32 s24, v22, 47 +; GFX9-NEXT: s_and_b32 s23, s23, 0xff +; GFX9-NEXT: s_lshl_b32 s24, s24, 8 +; GFX9-NEXT: s_or_b32 s23, s23, s24 +; GFX9-NEXT: s_and_b32 s22, s22, 0xffff +; GFX9-NEXT: s_lshl_b32 s23, s23, 16 +; GFX9-NEXT: s_or_b32 s22, s22, s23 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 ; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 @@ -84542,79 +84789,80 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:40 ; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: v_mov_b32_e32 v1, s16 -; GFX9-NEXT: s_and_b32 s16, s44, 0xff -; GFX9-NEXT: s_lshl_b32 s17, s17, 8 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 45 -; GFX9-NEXT: s_and_b32 s17, s17, 0xff -; GFX9-NEXT: s_lshl_b32 s18, s78, 8 -; GFX9-NEXT: s_or_b32 s17, s17, s18 -; GFX9-NEXT: s_and_b32 s16, s16, 0xffff -; GFX9-NEXT: s_lshl_b32 s17, s17, 16 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 44 +; GFX9-NEXT: v_mov_b32_e32 v1, s22 +; GFX9-NEXT: v_readlane_b32 s22, v22, 46 +; GFX9-NEXT: s_and_b32 s20, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s22, s22, 8 +; GFX9-NEXT: s_or_b32 s20, s20, s22 +; GFX9-NEXT: v_readlane_b32 s22, v22, 45 +; GFX9-NEXT: s_and_b32 s22, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s23, s78, 8 +; GFX9-NEXT: s_or_b32 s22, s22, s23 +; GFX9-NEXT: s_and_b32 s20, s20, 0xffff +; GFX9-NEXT: s_lshl_b32 s22, s22, 16 +; GFX9-NEXT: s_or_b32 s20, s20, s22 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 -; GFX9-NEXT: v_mov_b32_e32 v1, s16 -; GFX9-NEXT: s_and_b32 s16, s45, 0xff -; GFX9-NEXT: s_lshl_b32 s17, s17, 8 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 43 -; GFX9-NEXT: v_readlane_b32 s18, v21, 42 -; GFX9-NEXT: s_and_b32 s17, s17, 0xff -; GFX9-NEXT: s_lshl_b32 s18, s18, 8 -; GFX9-NEXT: s_or_b32 s17, s17, s18 -; GFX9-NEXT: s_and_b32 s16, s16, 0xffff -; GFX9-NEXT: s_lshl_b32 s17, s17, 16 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 41 +; GFX9-NEXT: v_mov_b32_e32 v1, s20 +; GFX9-NEXT: s_and_b32 s20, s21, 0xff +; GFX9-NEXT: v_readlane_b32 s21, v22, 44 +; GFX9-NEXT: s_lshl_b32 s21, s21, 8 +; GFX9-NEXT: s_or_b32 s20, s20, s21 +; GFX9-NEXT: v_readlane_b32 s21, v22, 43 +; GFX9-NEXT: v_readlane_b32 s22, v22, 42 +; GFX9-NEXT: s_and_b32 s21, s21, 0xff +; GFX9-NEXT: s_lshl_b32 s22, s22, 8 +; GFX9-NEXT: s_or_b32 s21, s21, s22 +; GFX9-NEXT: s_and_b32 s20, s20, 0xffff +; GFX9-NEXT: s_lshl_b32 s21, s21, 16 +; GFX9-NEXT: s_or_b32 s20, s20, s21 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: v_mov_b32_e32 v1, s16 -; GFX9-NEXT: s_and_b32 s16, s42, 0xff -; GFX9-NEXT: s_lshl_b32 s17, s17, 8 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 40 -; GFX9-NEXT: s_and_b32 s17, s17, 0xff -; GFX9-NEXT: s_lshl_b32 s18, s76, 8 -; GFX9-NEXT: s_or_b32 s17, s17, s18 -; GFX9-NEXT: s_and_b32 s16, s16, 0xffff -; GFX9-NEXT: s_lshl_b32 s17, s17, 16 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 39 +; GFX9-NEXT: v_mov_b32_e32 v1, s20 +; GFX9-NEXT: v_readlane_b32 s20, v22, 41 +; GFX9-NEXT: s_and_b32 s18, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s20, s20, 8 +; GFX9-NEXT: s_or_b32 s18, s18, s20 +; GFX9-NEXT: v_readlane_b32 s20, v22, 40 +; GFX9-NEXT: s_and_b32 s20, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s21, s76, 8 +; GFX9-NEXT: s_or_b32 s20, s20, s21 +; GFX9-NEXT: s_and_b32 s18, s18, 0xffff +; GFX9-NEXT: s_lshl_b32 s20, s20, 16 +; GFX9-NEXT: s_or_b32 s18, s18, s20 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: v_mov_b32_e32 v1, s16 -; GFX9-NEXT: s_and_b32 s16, s43, 0xff -; GFX9-NEXT: s_lshl_b32 s17, s17, 8 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 38 -; GFX9-NEXT: v_readlane_b32 s18, v21, 37 -; GFX9-NEXT: s_and_b32 s17, s17, 0xff -; GFX9-NEXT: s_lshl_b32 s18, s18, 8 -; GFX9-NEXT: s_or_b32 s17, s17, s18 -; GFX9-NEXT: s_and_b32 s16, s16, 0xffff -; GFX9-NEXT: s_lshl_b32 s17, s17, 16 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 36 +; GFX9-NEXT: v_mov_b32_e32 v1, s18 +; GFX9-NEXT: s_and_b32 s18, s19, 0xff +; GFX9-NEXT: v_readlane_b32 s19, v22, 39 +; GFX9-NEXT: s_lshl_b32 s19, s19, 8 +; GFX9-NEXT: s_or_b32 s18, s18, s19 +; GFX9-NEXT: v_readlane_b32 s19, v22, 38 +; GFX9-NEXT: v_readlane_b32 s20, v22, 37 +; GFX9-NEXT: s_and_b32 s19, s19, 0xff +; GFX9-NEXT: s_lshl_b32 s20, s20, 8 +; GFX9-NEXT: s_or_b32 s19, s19, s20 +; GFX9-NEXT: s_and_b32 s18, s18, 0xffff +; GFX9-NEXT: s_lshl_b32 s19, s19, 16 +; GFX9-NEXT: s_or_b32 s18, s18, s19 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64 -; GFX9-NEXT: v_mov_b32_e32 v1, s16 -; GFX9-NEXT: s_and_b32 s16, s40, 0xff -; GFX9-NEXT: s_lshl_b32 s17, s17, 8 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 35 -; GFX9-NEXT: s_and_b32 s17, s17, 0xff -; GFX9-NEXT: s_lshl_b32 s18, s74, 8 -; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: v_mov_b32_e32 v1, s18 +; GFX9-NEXT: v_readlane_b32 s18, v22, 36 +; GFX9-NEXT: s_and_b32 s16, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s18, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s18 +; GFX9-NEXT: v_readlane_b32 s18, v22, 35 +; GFX9-NEXT: s_and_b32 s18, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s19, s74, 8 +; GFX9-NEXT: s_or_b32 s18, s18, s19 ; GFX9-NEXT: s_and_b32 s16, s16, 0xffff -; GFX9-NEXT: s_lshl_b32 s17, s17, 16 -; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 34 +; GFX9-NEXT: s_lshl_b32 s18, s18, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s18 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68 ; GFX9-NEXT: v_mov_b32_e32 v1, s16 -; GFX9-NEXT: s_and_b32 s16, s41, 0xff +; GFX9-NEXT: s_and_b32 s16, s17, 0xff +; GFX9-NEXT: v_readlane_b32 s17, v22, 34 ; GFX9-NEXT: s_lshl_b32 s17, s17, 8 ; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 33 -; GFX9-NEXT: v_readlane_b32 s18, v21, 32 +; GFX9-NEXT: v_readlane_b32 s17, v22, 33 +; GFX9-NEXT: v_readlane_b32 s18, v22, 32 ; GFX9-NEXT: s_and_b32 s17, s17, 0xff ; GFX9-NEXT: s_lshl_b32 s18, s18, 8 ; GFX9-NEXT: s_or_b32 s17, s17, s18 @@ -84623,11 +84871,11 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s16, s16, s17 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 ; GFX9-NEXT: v_mov_b32_e32 v1, s16 -; GFX9-NEXT: v_readlane_b32 s16, v21, 31 +; GFX9-NEXT: v_readlane_b32 s16, v22, 31 ; GFX9-NEXT: s_and_b32 s14, s14, 0xff ; GFX9-NEXT: s_lshl_b32 s16, s16, 8 ; GFX9-NEXT: s_or_b32 s14, s14, s16 -; GFX9-NEXT: v_readlane_b32 s16, v21, 30 +; GFX9-NEXT: v_readlane_b32 s16, v22, 30 ; GFX9-NEXT: s_and_b32 s16, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s17, s72, 8 ; GFX9-NEXT: s_or_b32 s16, s16, s17 @@ -84637,11 +84885,11 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76 ; GFX9-NEXT: v_mov_b32_e32 v1, s14 ; GFX9-NEXT: s_and_b32 s14, s15, 0xff -; GFX9-NEXT: v_readlane_b32 s15, v21, 29 +; GFX9-NEXT: v_readlane_b32 s15, v22, 29 ; GFX9-NEXT: s_lshl_b32 s15, s15, 8 ; GFX9-NEXT: s_or_b32 s14, s14, s15 -; GFX9-NEXT: v_readlane_b32 s15, v21, 28 -; GFX9-NEXT: v_readlane_b32 s16, v21, 27 +; GFX9-NEXT: v_readlane_b32 s15, v22, 28 +; GFX9-NEXT: v_readlane_b32 s16, v22, 27 ; GFX9-NEXT: s_and_b32 s15, s15, 0xff ; GFX9-NEXT: s_lshl_b32 s16, s16, 8 ; GFX9-NEXT: s_or_b32 s15, s15, s16 @@ -84650,11 +84898,11 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s14, s14, s15 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 ; GFX9-NEXT: v_mov_b32_e32 v1, s14 -; GFX9-NEXT: v_readlane_b32 s14, v21, 26 +; GFX9-NEXT: v_readlane_b32 s14, v22, 26 ; GFX9-NEXT: s_and_b32 s12, s12, 0xff ; GFX9-NEXT: s_lshl_b32 s14, s14, 8 ; GFX9-NEXT: s_or_b32 s12, s12, s14 -; GFX9-NEXT: v_readlane_b32 s14, v21, 25 +; GFX9-NEXT: v_readlane_b32 s14, v22, 25 ; GFX9-NEXT: s_and_b32 s14, s14, 0xff ; GFX9-NEXT: s_lshl_b32 s15, s62, 8 ; GFX9-NEXT: s_or_b32 s14, s14, s15 @@ -84664,11 +84912,11 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84 ; GFX9-NEXT: v_mov_b32_e32 v1, s12 ; GFX9-NEXT: s_and_b32 s12, s13, 0xff -; GFX9-NEXT: v_readlane_b32 s13, v21, 24 +; GFX9-NEXT: v_readlane_b32 s13, v22, 24 ; GFX9-NEXT: s_lshl_b32 s13, s13, 8 ; GFX9-NEXT: s_or_b32 s12, s12, s13 -; GFX9-NEXT: v_readlane_b32 s13, v21, 23 -; GFX9-NEXT: v_readlane_b32 s14, v21, 22 +; GFX9-NEXT: v_readlane_b32 s13, v22, 23 +; GFX9-NEXT: v_readlane_b32 s14, v22, 22 ; GFX9-NEXT: s_and_b32 s13, s13, 0xff ; GFX9-NEXT: s_lshl_b32 s14, s14, 8 ; GFX9-NEXT: s_or_b32 s13, s13, s14 @@ -84677,11 +84925,11 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s12, s12, s13 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88 ; GFX9-NEXT: v_mov_b32_e32 v1, s12 -; GFX9-NEXT: v_readlane_b32 s12, v21, 21 +; GFX9-NEXT: v_readlane_b32 s12, v22, 21 ; GFX9-NEXT: s_and_b32 s10, s10, 0xff ; GFX9-NEXT: s_lshl_b32 s12, s12, 8 ; GFX9-NEXT: s_or_b32 s10, s10, s12 -; GFX9-NEXT: v_readlane_b32 s12, v21, 20 +; GFX9-NEXT: v_readlane_b32 s12, v22, 20 ; GFX9-NEXT: s_and_b32 s12, s12, 0xff ; GFX9-NEXT: s_lshl_b32 s13, s60, 8 ; GFX9-NEXT: s_or_b32 s12, s12, s13 @@ -84691,11 +84939,11 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92 ; GFX9-NEXT: v_mov_b32_e32 v1, s10 ; GFX9-NEXT: s_and_b32 s10, s11, 0xff -; GFX9-NEXT: v_readlane_b32 s11, v21, 19 +; GFX9-NEXT: v_readlane_b32 s11, v22, 19 ; GFX9-NEXT: s_lshl_b32 s11, s11, 8 ; GFX9-NEXT: s_or_b32 s10, s10, s11 -; GFX9-NEXT: v_readlane_b32 s11, v21, 18 -; GFX9-NEXT: v_readlane_b32 s12, v21, 17 +; GFX9-NEXT: v_readlane_b32 s11, v22, 18 +; GFX9-NEXT: v_readlane_b32 s12, v22, 17 ; GFX9-NEXT: s_and_b32 s11, s11, 0xff ; GFX9-NEXT: s_lshl_b32 s12, s12, 8 ; GFX9-NEXT: s_or_b32 s11, s11, s12 @@ -84704,11 +84952,11 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s10, s10, s11 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96 ; GFX9-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-NEXT: v_readlane_b32 s10, v21, 16 +; GFX9-NEXT: v_readlane_b32 s10, v22, 16 ; GFX9-NEXT: s_and_b32 s8, s8, 0xff ; GFX9-NEXT: s_lshl_b32 s10, s10, 8 ; GFX9-NEXT: s_or_b32 s8, s8, s10 -; GFX9-NEXT: v_readlane_b32 s10, v21, 15 +; GFX9-NEXT: v_readlane_b32 s10, v22, 15 ; GFX9-NEXT: s_and_b32 s10, s10, 0xff ; GFX9-NEXT: s_lshl_b32 s11, s58, 8 ; GFX9-NEXT: s_or_b32 s10, s10, s11 @@ -84718,11 +84966,11 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: s_and_b32 s8, s9, 0xff -; GFX9-NEXT: v_readlane_b32 s9, v21, 14 +; GFX9-NEXT: v_readlane_b32 s9, v22, 14 ; GFX9-NEXT: s_lshl_b32 s9, s9, 8 ; GFX9-NEXT: s_or_b32 s8, s8, s9 -; GFX9-NEXT: v_readlane_b32 s9, v21, 13 -; GFX9-NEXT: v_readlane_b32 s10, v21, 12 +; GFX9-NEXT: v_readlane_b32 s9, v22, 13 +; GFX9-NEXT: v_readlane_b32 s10, v22, 12 ; GFX9-NEXT: s_and_b32 s9, s9, 0xff ; GFX9-NEXT: s_lshl_b32 s10, s10, 8 ; GFX9-NEXT: s_or_b32 s9, s9, s10 @@ -84731,13 +84979,13 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s8, s8, s9 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-NEXT: v_readlane_b32 s8, v21, 11 +; GFX9-NEXT: v_readlane_b32 s8, v22, 11 ; GFX9-NEXT: s_and_b32 s6, s6, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s8, 8 ; GFX9-NEXT: s_or_b32 s6, s6, s8 -; GFX9-NEXT: v_readlane_b32 s8, v21, 10 +; GFX9-NEXT: v_readlane_b32 s8, v22, 10 ; GFX9-NEXT: s_and_b32 s8, s8, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s56, 8 +; GFX9-NEXT: s_lshl_b32 s9, s28, 8 ; GFX9-NEXT: s_or_b32 s8, s8, s9 ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s8, s8, 16 @@ -84745,11 +84993,11 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: s_and_b32 s6, s7, 0xff -; GFX9-NEXT: v_readlane_b32 s7, v21, 9 +; GFX9-NEXT: v_readlane_b32 s7, v22, 9 ; GFX9-NEXT: s_lshl_b32 s7, s7, 8 ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: v_readlane_b32 s7, v21, 8 -; GFX9-NEXT: v_readlane_b32 s8, v21, 7 +; GFX9-NEXT: v_readlane_b32 s7, v22, 8 +; GFX9-NEXT: v_readlane_b32 s8, v22, 7 ; GFX9-NEXT: s_and_b32 s7, s7, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s8, 8 ; GFX9-NEXT: s_or_b32 s7, s7, s8 @@ -84758,12 +85006,12 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_readlane_b32 s6, v21, 6 +; GFX9-NEXT: v_readlane_b32 s6, v22, 6 ; GFX9-NEXT: s_and_b32 s4, s4, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s6, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s6 -; GFX9-NEXT: v_readlane_b32 s6, v21, 5 -; GFX9-NEXT: v_readlane_b32 s8, v21, 0 +; GFX9-NEXT: v_readlane_b32 s6, v22, 5 +; GFX9-NEXT: v_readlane_b32 s8, v22, 0 ; GFX9-NEXT: s_and_b32 s6, s6, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s8, 8 ; GFX9-NEXT: s_or_b32 s6, s6, s7 @@ -84773,11 +85021,11 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_and_b32 s4, s5, 0xff -; GFX9-NEXT: v_readlane_b32 s5, v21, 4 +; GFX9-NEXT: v_readlane_b32 s5, v22, 4 ; GFX9-NEXT: s_lshl_b32 s5, s5, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: v_readlane_b32 s5, v21, 3 -; GFX9-NEXT: v_readlane_b32 s6, v21, 2 +; GFX9-NEXT: v_readlane_b32 s5, v22, 3 +; GFX9-NEXT: v_readlane_b32 s6, v22, 2 ; GFX9-NEXT: s_and_b32 s5, s5, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s6, 8 ; GFX9-NEXT: s_or_b32 s5, s5, s6 @@ -84786,61 +85034,61 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_readlane_b32 s9, v21, 1 +; GFX9-NEXT: v_readlane_b32 s9, v22, 1 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 -; GFX9-NEXT: v_readlane_b32 s99, v20, 35 -; GFX9-NEXT: v_readlane_b32 s98, v20, 34 -; GFX9-NEXT: v_readlane_b32 s97, v20, 33 -; GFX9-NEXT: v_readlane_b32 s96, v20, 32 -; GFX9-NEXT: v_readlane_b32 s87, v20, 31 -; GFX9-NEXT: v_readlane_b32 s86, v20, 30 -; GFX9-NEXT: v_readlane_b32 s85, v20, 29 -; GFX9-NEXT: v_readlane_b32 s84, v20, 28 -; GFX9-NEXT: v_readlane_b32 s83, v20, 27 -; GFX9-NEXT: v_readlane_b32 s82, v20, 26 -; GFX9-NEXT: v_readlane_b32 s81, v20, 25 -; GFX9-NEXT: v_readlane_b32 s80, v20, 24 -; GFX9-NEXT: v_readlane_b32 s71, v20, 23 -; GFX9-NEXT: v_readlane_b32 s70, v20, 22 -; GFX9-NEXT: v_readlane_b32 s69, v20, 21 -; GFX9-NEXT: v_readlane_b32 s68, v20, 20 -; GFX9-NEXT: v_readlane_b32 s67, v20, 19 -; GFX9-NEXT: v_readlane_b32 s66, v20, 18 -; GFX9-NEXT: v_readlane_b32 s65, v20, 17 -; GFX9-NEXT: v_readlane_b32 s64, v20, 16 -; GFX9-NEXT: v_readlane_b32 s55, v20, 15 -; GFX9-NEXT: v_readlane_b32 s54, v20, 14 -; GFX9-NEXT: v_readlane_b32 s53, v20, 13 -; GFX9-NEXT: v_readlane_b32 s52, v20, 12 -; GFX9-NEXT: v_readlane_b32 s51, v20, 11 -; GFX9-NEXT: v_readlane_b32 s50, v20, 10 -; GFX9-NEXT: v_readlane_b32 s49, v20, 9 -; GFX9-NEXT: v_readlane_b32 s48, v20, 8 -; GFX9-NEXT: v_readlane_b32 s39, v20, 7 -; GFX9-NEXT: v_readlane_b32 s38, v20, 6 -; GFX9-NEXT: v_readlane_b32 s37, v20, 5 -; GFX9-NEXT: v_readlane_b32 s36, v20, 4 -; GFX9-NEXT: v_readlane_b32 s35, v20, 3 -; GFX9-NEXT: v_readlane_b32 s34, v20, 2 -; GFX9-NEXT: v_readlane_b32 s31, v20, 1 -; GFX9-NEXT: v_readlane_b32 s30, v20, 0 +; GFX9-NEXT: v_readlane_b32 s99, v21, 35 +; GFX9-NEXT: v_readlane_b32 s98, v21, 34 +; GFX9-NEXT: v_readlane_b32 s97, v21, 33 +; GFX9-NEXT: v_readlane_b32 s96, v21, 32 +; GFX9-NEXT: v_readlane_b32 s87, v21, 31 +; GFX9-NEXT: v_readlane_b32 s86, v21, 30 +; GFX9-NEXT: v_readlane_b32 s85, v21, 29 +; GFX9-NEXT: v_readlane_b32 s84, v21, 28 +; GFX9-NEXT: v_readlane_b32 s83, v21, 27 +; GFX9-NEXT: v_readlane_b32 s82, v21, 26 +; GFX9-NEXT: v_readlane_b32 s81, v21, 25 +; GFX9-NEXT: v_readlane_b32 s80, v21, 24 +; GFX9-NEXT: v_readlane_b32 s71, v21, 23 +; GFX9-NEXT: v_readlane_b32 s70, v21, 22 +; GFX9-NEXT: v_readlane_b32 s69, v21, 21 +; GFX9-NEXT: v_readlane_b32 s68, v21, 20 +; GFX9-NEXT: v_readlane_b32 s67, v21, 19 +; GFX9-NEXT: v_readlane_b32 s66, v21, 18 +; GFX9-NEXT: v_readlane_b32 s65, v21, 17 +; GFX9-NEXT: v_readlane_b32 s64, v21, 16 +; GFX9-NEXT: v_readlane_b32 s55, v21, 15 +; GFX9-NEXT: v_readlane_b32 s54, v21, 14 +; GFX9-NEXT: v_readlane_b32 s53, v21, 13 +; GFX9-NEXT: v_readlane_b32 s52, v21, 12 +; GFX9-NEXT: v_readlane_b32 s51, v21, 11 +; GFX9-NEXT: v_readlane_b32 s50, v21, 10 +; GFX9-NEXT: v_readlane_b32 s49, v21, 9 +; GFX9-NEXT: v_readlane_b32 s48, v21, 8 +; GFX9-NEXT: v_readlane_b32 s39, v21, 7 +; GFX9-NEXT: v_readlane_b32 s38, v21, 6 +; GFX9-NEXT: v_readlane_b32 s37, v21, 5 +; GFX9-NEXT: v_readlane_b32 s36, v21, 4 +; GFX9-NEXT: v_readlane_b32 s35, v21, 3 +; GFX9-NEXT: v_readlane_b32 s34, v21, 2 +; GFX9-NEXT: v_readlane_b32 s31, v21, 1 +; GFX9-NEXT: v_readlane_b32 s30, v21, 0 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB57_4: -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 ; GFX9-NEXT: ; implicit-def: $sgpr83 ; GFX9-NEXT: ; implicit-def: $sgpr82 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: v_writelane_b32 v21, s82, 0 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: v_writelane_b32 v22, s82, 0 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; implicit-def: $sgpr80 ; GFX9-NEXT: ; implicit-def: $sgpr71 ; GFX9-NEXT: ; implicit-def: $sgpr70 @@ -84883,101 +85131,101 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: ; implicit-def: $sgpr62 ; GFX9-NEXT: ; implicit-def: $sgpr60 ; GFX9-NEXT: ; implicit-def: $sgpr58 -; GFX9-NEXT: ; implicit-def: $sgpr56 -; GFX9-NEXT: v_writelane_b32 v21, s83, 1 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr28 +; GFX9-NEXT: v_writelane_b32 v22, s83, 1 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 ; GFX9-NEXT: ; implicit-def: $sgpr82 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 -; GFX9-NEXT: ; implicit-def: $sgpr47 -; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 ; GFX9-NEXT: s_branch .LBB57_2 ; ; GFX11-LABEL: bitcast_v16i64_to_v128i8_scalar: @@ -84985,213 +85233,240 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill -; GFX11-NEXT: scratch_store_b32 off, v16, s32 -; GFX11-NEXT: scratch_store_b32 off, v17, s32 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v18, s32 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v19, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v34, s32 +; GFX11-NEXT: scratch_store_b32 off, v35, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v36, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v37, s32 offset:12 ; GFX11-NEXT: s_mov_b32 exec_lo, s4 -; GFX11-NEXT: v_writelane_b32 v16, s30, 0 -; GFX11-NEXT: v_writelane_b32 v17, s96, 0 +; GFX11-NEXT: v_writelane_b32 v34, s30, 0 +; GFX11-NEXT: v_writelane_b32 v35, s96, 0 +; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1 +; GFX11-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3 +; GFX11-NEXT: v_writelane_b32 v34, s31, 1 +; GFX11-NEXT: v_writelane_b32 v35, s97, 1 +; GFX11-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v21, s17 +; GFX11-NEXT: v_dual_mov_b32 v22, s18 :: v_dual_mov_b32 v23, s19 +; GFX11-NEXT: v_writelane_b32 v34, s34, 2 +; GFX11-NEXT: v_writelane_b32 v35, s98, 2 +; GFX11-NEXT: v_dual_mov_b32 v24, s20 :: v_dual_mov_b32 v25, s21 +; GFX11-NEXT: v_dual_mov_b32 v26, s22 :: v_dual_mov_b32 v27, s23 +; GFX11-NEXT: v_writelane_b32 v34, s35, 3 +; GFX11-NEXT: v_writelane_b32 v35, s99, 3 +; GFX11-NEXT: v_dual_mov_b32 v28, s24 :: v_dual_mov_b32 v29, s25 +; GFX11-NEXT: v_dual_mov_b32 v30, s26 :: v_dual_mov_b32 v31, s27 +; GFX11-NEXT: v_writelane_b32 v34, s36, 4 +; GFX11-NEXT: v_writelane_b32 v35, s100, 4 +; GFX11-NEXT: v_dual_mov_b32 v32, s28 :: v_dual_mov_b32 v33, s29 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 -; GFX11-NEXT: v_readfirstlane_b32 s40, v1 -; GFX11-NEXT: v_readfirstlane_b32 s41, v2 -; GFX11-NEXT: v_writelane_b32 v16, s31, 1 -; GFX11-NEXT: v_writelane_b32 v17, s97, 1 -; GFX11-NEXT: v_readfirstlane_b32 s14, v3 -; GFX11-NEXT: v_readfirstlane_b32 s15, v4 -; GFX11-NEXT: v_readfirstlane_b32 s12, v5 -; GFX11-NEXT: v_writelane_b32 v16, s34, 2 -; GFX11-NEXT: v_writelane_b32 v17, s98, 2 -; GFX11-NEXT: v_readfirstlane_b32 s13, v6 -; GFX11-NEXT: v_readfirstlane_b32 s10, v7 -; GFX11-NEXT: v_readfirstlane_b32 s11, v8 -; GFX11-NEXT: v_writelane_b32 v16, s35, 3 -; GFX11-NEXT: v_writelane_b32 v17, s99, 3 -; GFX11-NEXT: v_readfirstlane_b32 s8, v9 -; GFX11-NEXT: v_readfirstlane_b32 s9, v10 -; GFX11-NEXT: v_readfirstlane_b32 s6, v11 -; GFX11-NEXT: v_writelane_b32 v16, s36, 4 -; GFX11-NEXT: v_writelane_b32 v17, s100, 4 -; GFX11-NEXT: v_readfirstlane_b32 s7, v12 -; GFX11-NEXT: v_readfirstlane_b32 s4, v13 -; GFX11-NEXT: v_readfirstlane_b32 s5, v14 -; GFX11-NEXT: v_writelane_b32 v16, s37, 5 -; GFX11-NEXT: v_writelane_b32 v17, s101, 5 +; GFX11-NEXT: v_writelane_b32 v34, s37, 5 +; GFX11-NEXT: v_writelane_b32 v35, s101, 5 +; GFX11-NEXT: v_readfirstlane_b32 s40, v16 +; GFX11-NEXT: v_readfirstlane_b32 s41, v17 +; GFX11-NEXT: v_readfirstlane_b32 s28, v18 +; GFX11-NEXT: v_writelane_b32 v34, s38, 6 +; GFX11-NEXT: v_writelane_b32 v35, s102, 6 +; GFX11-NEXT: v_readfirstlane_b32 s29, v19 +; GFX11-NEXT: v_readfirstlane_b32 s26, v20 +; GFX11-NEXT: v_readfirstlane_b32 s27, v21 +; GFX11-NEXT: v_writelane_b32 v34, s39, 7 +; GFX11-NEXT: v_writelane_b32 v35, s103, 7 +; GFX11-NEXT: v_readfirstlane_b32 s24, v22 +; GFX11-NEXT: v_readfirstlane_b32 s25, v23 +; GFX11-NEXT: v_readfirstlane_b32 s22, v24 +; GFX11-NEXT: v_writelane_b32 v34, s48, 8 +; GFX11-NEXT: v_readfirstlane_b32 s23, v25 +; GFX11-NEXT: v_readfirstlane_b32 s20, v26 +; GFX11-NEXT: v_readfirstlane_b32 s21, v27 +; GFX11-NEXT: v_readfirstlane_b32 s18, v28 +; GFX11-NEXT: v_writelane_b32 v34, s49, 9 +; GFX11-NEXT: v_readfirstlane_b32 s19, v29 +; GFX11-NEXT: v_readfirstlane_b32 s16, v30 +; GFX11-NEXT: v_readfirstlane_b32 s17, v31 +; GFX11-NEXT: v_readfirstlane_b32 s14, v32 +; GFX11-NEXT: v_writelane_b32 v34, s50, 10 +; GFX11-NEXT: v_readfirstlane_b32 s15, v33 +; GFX11-NEXT: v_readfirstlane_b32 s12, v1 +; GFX11-NEXT: v_readfirstlane_b32 s13, v2 +; GFX11-NEXT: v_readfirstlane_b32 s10, v3 +; GFX11-NEXT: v_writelane_b32 v34, s51, 11 +; GFX11-NEXT: v_readfirstlane_b32 s11, v4 +; GFX11-NEXT: v_readfirstlane_b32 s8, v5 +; GFX11-NEXT: v_readfirstlane_b32 s9, v6 +; GFX11-NEXT: v_readfirstlane_b32 s6, v7 +; GFX11-NEXT: v_writelane_b32 v34, s52, 12 +; GFX11-NEXT: v_readfirstlane_b32 s7, v8 +; GFX11-NEXT: v_readfirstlane_b32 s4, v9 +; GFX11-NEXT: v_readfirstlane_b32 s5, v10 +; GFX11-NEXT: v_readfirstlane_b32 s2, v11 +; GFX11-NEXT: v_writelane_b32 v34, s53, 13 +; GFX11-NEXT: v_readfirstlane_b32 s3, v12 +; GFX11-NEXT: v_readfirstlane_b32 s0, v13 +; GFX11-NEXT: v_readfirstlane_b32 s1, v14 ; GFX11-NEXT: s_mov_b32 s101, 0 +; GFX11-NEXT: v_writelane_b32 v34, s54, 14 ; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo -; GFX11-NEXT: ; implicit-def: $vgpr19 : SGPR spill to VGPR lane -; GFX11-NEXT: ; implicit-def: $vgpr18 : SGPR spill to VGPR lane -; GFX11-NEXT: v_writelane_b32 v16, s38, 6 -; GFX11-NEXT: v_writelane_b32 v17, s102, 6 -; GFX11-NEXT: v_writelane_b32 v16, s39, 7 -; GFX11-NEXT: v_writelane_b32 v17, s103, 7 -; GFX11-NEXT: v_writelane_b32 v16, s48, 8 -; GFX11-NEXT: v_writelane_b32 v17, s104, 8 -; GFX11-NEXT: v_writelane_b32 v16, s49, 9 -; GFX11-NEXT: v_writelane_b32 v16, s50, 10 -; GFX11-NEXT: v_writelane_b32 v16, s51, 11 -; GFX11-NEXT: v_writelane_b32 v16, s52, 12 -; GFX11-NEXT: v_writelane_b32 v16, s53, 13 -; GFX11-NEXT: v_writelane_b32 v16, s54, 14 -; GFX11-NEXT: v_writelane_b32 v16, s55, 15 -; GFX11-NEXT: v_writelane_b32 v16, s64, 16 -; GFX11-NEXT: v_writelane_b32 v16, s65, 17 -; GFX11-NEXT: v_writelane_b32 v16, s66, 18 -; GFX11-NEXT: v_writelane_b32 v16, s67, 19 -; GFX11-NEXT: v_writelane_b32 v16, s68, 20 -; GFX11-NEXT: v_writelane_b32 v16, s69, 21 -; GFX11-NEXT: v_writelane_b32 v16, s70, 22 -; GFX11-NEXT: v_writelane_b32 v16, s71, 23 -; GFX11-NEXT: v_writelane_b32 v16, s80, 24 -; GFX11-NEXT: v_writelane_b32 v16, s81, 25 -; GFX11-NEXT: v_writelane_b32 v16, s82, 26 -; GFX11-NEXT: v_writelane_b32 v16, s83, 27 -; GFX11-NEXT: v_writelane_b32 v16, s84, 28 -; GFX11-NEXT: v_writelane_b32 v16, s85, 29 -; GFX11-NEXT: v_writelane_b32 v16, s86, 30 -; GFX11-NEXT: v_writelane_b32 v16, s87, 31 +; GFX11-NEXT: v_writelane_b32 v35, s104, 8 +; GFX11-NEXT: ; implicit-def: $vgpr37 : SGPR spill to VGPR lane +; GFX11-NEXT: ; implicit-def: $vgpr36 : SGPR spill to VGPR lane +; GFX11-NEXT: v_writelane_b32 v34, s55, 15 +; GFX11-NEXT: v_writelane_b32 v34, s64, 16 +; GFX11-NEXT: v_writelane_b32 v34, s65, 17 +; GFX11-NEXT: v_writelane_b32 v34, s66, 18 +; GFX11-NEXT: v_writelane_b32 v34, s67, 19 +; GFX11-NEXT: v_writelane_b32 v34, s68, 20 +; GFX11-NEXT: v_writelane_b32 v34, s69, 21 +; GFX11-NEXT: v_writelane_b32 v34, s70, 22 +; GFX11-NEXT: v_writelane_b32 v34, s71, 23 +; GFX11-NEXT: v_writelane_b32 v34, s80, 24 +; GFX11-NEXT: v_writelane_b32 v34, s81, 25 +; GFX11-NEXT: v_writelane_b32 v34, s82, 26 +; GFX11-NEXT: v_writelane_b32 v34, s83, 27 +; GFX11-NEXT: v_writelane_b32 v34, s84, 28 +; GFX11-NEXT: v_writelane_b32 v34, s85, 29 +; GFX11-NEXT: v_writelane_b32 v34, s86, 30 +; GFX11-NEXT: v_writelane_b32 v34, s87, 31 ; GFX11-NEXT: s_cbranch_scc0 .LBB57_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: s_lshr_b32 s43, s25, 8 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[4:5], 24 -; GFX11-NEXT: v_writelane_b32 v19, s43, 16 -; GFX11-NEXT: s_lshr_b32 s43, s24, 16 -; GFX11-NEXT: s_lshr_b32 s104, s5, 24 -; GFX11-NEXT: s_lshr_b32 s102, s5, 16 -; GFX11-NEXT: s_lshr_b32 s103, s5, 8 -; GFX11-NEXT: v_writelane_b32 v19, s43, 17 -; GFX11-NEXT: s_lshr_b32 s43, s24, 8 -; GFX11-NEXT: s_lshr_b32 s57, s4, 16 -; GFX11-NEXT: s_lshr_b32 s47, s4, 8 -; GFX11-NEXT: s_lshr_b32 s46, s7, 24 -; GFX11-NEXT: v_writelane_b32 v19, s43, 18 -; GFX11-NEXT: s_lshr_b32 s43, s23, 24 -; GFX11-NEXT: s_lshr_b32 vcc_hi, s7, 16 -; GFX11-NEXT: s_lshr_b32 s34, s7, 8 -; GFX11-NEXT: s_lshr_b32 s69, s6, 16 -; GFX11-NEXT: v_writelane_b32 v19, s43, 19 -; GFX11-NEXT: s_lshr_b32 s43, s23, 16 -; GFX11-NEXT: s_lshr_b32 s56, s6, 8 -; GFX11-NEXT: s_lshr_b32 s35, s9, 24 -; GFX11-NEXT: s_lshr_b32 s36, s9, 16 -; GFX11-NEXT: v_writelane_b32 v19, s43, 20 -; GFX11-NEXT: s_lshr_b32 s43, s23, 8 -; GFX11-NEXT: s_lshr_b32 s37, s9, 8 -; GFX11-NEXT: s_lshr_b32 s38, s8, 16 -; GFX11-NEXT: s_lshr_b32 s39, s8, 8 -; GFX11-NEXT: v_writelane_b32 v19, s43, 21 -; GFX11-NEXT: s_lshr_b32 s43, s22, 16 -; GFX11-NEXT: s_lshr_b32 s48, s11, 24 -; GFX11-NEXT: s_lshr_b32 s49, s11, 16 -; GFX11-NEXT: s_lshr_b32 s50, s11, 8 -; GFX11-NEXT: v_writelane_b32 v19, s43, 22 -; GFX11-NEXT: s_lshr_b32 s43, s22, 8 -; GFX11-NEXT: s_lshr_b32 s51, s10, 16 -; GFX11-NEXT: s_lshr_b32 s52, s10, 8 -; GFX11-NEXT: s_lshr_b32 s53, s13, 24 -; GFX11-NEXT: v_writelane_b32 v19, s43, 23 +; GFX11-NEXT: s_lshr_b32 s43, s19, 8 +; GFX11-NEXT: s_lshr_b64 s[62:63], s[0:1], 24 +; GFX11-NEXT: v_writelane_b32 v37, s43, 16 +; GFX11-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-NEXT: s_lshr_b32 s104, s1, 24 +; GFX11-NEXT: s_lshr_b32 s102, s1, 16 +; GFX11-NEXT: s_lshr_b32 s103, s1, 8 +; GFX11-NEXT: v_writelane_b32 v37, s43, 17 +; GFX11-NEXT: s_lshr_b32 s43, s18, 8 +; GFX11-NEXT: s_lshr_b32 s57, s0, 16 +; GFX11-NEXT: s_lshr_b32 s47, s0, 8 +; GFX11-NEXT: s_lshr_b32 s46, s3, 24 +; GFX11-NEXT: v_writelane_b32 v37, s43, 18 ; GFX11-NEXT: s_lshr_b32 s43, s21, 24 -; GFX11-NEXT: s_lshr_b32 s54, s13, 16 -; GFX11-NEXT: s_lshr_b32 s55, s13, 8 -; GFX11-NEXT: s_lshr_b32 s64, s12, 16 -; GFX11-NEXT: v_writelane_b32 v19, s43, 24 +; GFX11-NEXT: s_lshr_b32 vcc_hi, s3, 16 +; GFX11-NEXT: s_lshr_b32 s34, s3, 8 +; GFX11-NEXT: s_lshr_b32 s69, s2, 16 +; GFX11-NEXT: v_writelane_b32 v37, s43, 19 ; GFX11-NEXT: s_lshr_b32 s43, s21, 16 -; GFX11-NEXT: s_lshr_b32 s65, s12, 8 -; GFX11-NEXT: s_lshr_b32 s66, s15, 24 -; GFX11-NEXT: s_lshr_b32 s67, s15, 16 -; GFX11-NEXT: v_writelane_b32 v19, s43, 25 +; GFX11-NEXT: s_lshr_b32 s56, s2, 8 +; GFX11-NEXT: s_lshr_b32 s35, s5, 24 +; GFX11-NEXT: s_lshr_b32 s36, s5, 16 +; GFX11-NEXT: v_writelane_b32 v37, s43, 20 ; GFX11-NEXT: s_lshr_b32 s43, s21, 8 -; GFX11-NEXT: s_lshr_b32 s68, s15, 8 -; GFX11-NEXT: s_lshr_b32 s59, s14, 16 -; GFX11-NEXT: s_lshr_b32 s58, s14, 8 -; GFX11-NEXT: v_writelane_b32 v19, s43, 26 +; GFX11-NEXT: s_lshr_b32 s37, s5, 8 +; GFX11-NEXT: s_lshr_b32 s38, s4, 16 +; GFX11-NEXT: s_lshr_b32 s39, s4, 8 +; GFX11-NEXT: v_writelane_b32 v37, s43, 21 ; GFX11-NEXT: s_lshr_b32 s43, s20, 16 -; GFX11-NEXT: s_lshr_b32 s70, s41, 24 -; GFX11-NEXT: s_lshr_b32 s71, s41, 16 -; GFX11-NEXT: s_lshr_b32 s60, s41, 8 -; GFX11-NEXT: v_writelane_b32 v19, s43, 27 +; GFX11-NEXT: s_lshr_b32 s48, s7, 24 +; GFX11-NEXT: s_lshr_b32 s49, s7, 16 +; GFX11-NEXT: s_lshr_b32 s50, s7, 8 +; GFX11-NEXT: v_writelane_b32 v37, s43, 22 ; GFX11-NEXT: s_lshr_b32 s43, s20, 8 -; GFX11-NEXT: s_lshr_b32 s80, s40, 16 -; GFX11-NEXT: s_lshr_b32 s61, s40, 8 -; GFX11-NEXT: s_lshr_b32 s81, s29, 24 -; GFX11-NEXT: v_writelane_b32 v19, s43, 28 -; GFX11-NEXT: s_lshr_b32 s43, s19, 24 -; GFX11-NEXT: s_lshr_b32 s82, s29, 16 -; GFX11-NEXT: s_lshr_b32 s83, s29, 8 -; GFX11-NEXT: s_lshr_b32 s84, s28, 16 -; GFX11-NEXT: v_writelane_b32 v19, s43, 29 -; GFX11-NEXT: s_lshr_b32 s43, s19, 16 -; GFX11-NEXT: s_lshr_b32 s85, s28, 8 -; GFX11-NEXT: s_lshr_b32 s86, s27, 24 -; GFX11-NEXT: s_lshr_b32 s72, s27, 16 -; GFX11-NEXT: v_writelane_b32 v19, s43, 30 -; GFX11-NEXT: s_lshr_b32 s43, s19, 8 -; GFX11-NEXT: s_lshr_b32 s87, s27, 8 -; GFX11-NEXT: s_lshr_b32 s73, s26, 16 -; GFX11-NEXT: s_lshr_b32 s96, s26, 8 -; GFX11-NEXT: v_writelane_b32 v19, s43, 31 -; GFX11-NEXT: s_lshr_b32 s43, s18, 16 -; GFX11-NEXT: s_lshr_b32 s97, s25, 24 -; GFX11-NEXT: v_writelane_b32 v18, s43, 0 -; GFX11-NEXT: s_lshr_b32 s43, s18, 8 -; GFX11-NEXT: v_writelane_b32 v19, s62, 14 -; GFX11-NEXT: s_lshr_b32 s42, s25, 16 -; GFX11-NEXT: s_lshr_b32 s74, s2, 16 -; GFX11-NEXT: v_writelane_b32 v18, s43, 1 -; GFX11-NEXT: s_lshr_b32 s43, s17, 24 -; GFX11-NEXT: v_writelane_b32 v19, s63, 15 +; GFX11-NEXT: s_lshr_b32 s51, s6, 16 +; GFX11-NEXT: s_lshr_b32 s52, s6, 8 +; GFX11-NEXT: s_lshr_b32 s53, s9, 24 +; GFX11-NEXT: v_writelane_b32 v37, s43, 23 +; GFX11-NEXT: s_lshr_b32 s43, s23, 24 +; GFX11-NEXT: s_lshr_b32 s54, s9, 16 +; GFX11-NEXT: s_lshr_b32 s55, s9, 8 +; GFX11-NEXT: s_lshr_b32 s64, s8, 16 +; GFX11-NEXT: v_writelane_b32 v37, s43, 24 +; GFX11-NEXT: s_lshr_b32 s43, s23, 16 +; GFX11-NEXT: s_lshr_b32 s65, s8, 8 +; GFX11-NEXT: s_lshr_b32 s66, s11, 24 +; GFX11-NEXT: s_lshr_b32 s67, s11, 16 +; GFX11-NEXT: v_writelane_b32 v37, s43, 25 +; GFX11-NEXT: s_lshr_b32 s43, s23, 8 +; GFX11-NEXT: s_lshr_b32 s68, s11, 8 +; GFX11-NEXT: s_lshr_b32 s59, s10, 16 +; GFX11-NEXT: s_lshr_b32 s58, s10, 8 +; GFX11-NEXT: v_writelane_b32 v37, s43, 26 +; GFX11-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-NEXT: s_lshr_b32 s70, s13, 24 +; GFX11-NEXT: s_lshr_b32 s71, s13, 16 +; GFX11-NEXT: s_lshr_b32 s60, s13, 8 +; GFX11-NEXT: v_writelane_b32 v37, s43, 27 +; GFX11-NEXT: s_lshr_b32 s43, s22, 8 +; GFX11-NEXT: s_lshr_b32 s80, s12, 16 +; GFX11-NEXT: s_lshr_b32 s61, s12, 8 +; GFX11-NEXT: s_lshr_b32 s81, s15, 24 +; GFX11-NEXT: v_writelane_b32 v37, s43, 28 +; GFX11-NEXT: s_lshr_b32 s43, s25, 24 +; GFX11-NEXT: s_lshr_b32 s82, s15, 16 +; GFX11-NEXT: s_lshr_b32 s83, s15, 8 +; GFX11-NEXT: s_lshr_b32 s84, s14, 16 +; GFX11-NEXT: v_writelane_b32 v37, s43, 29 +; GFX11-NEXT: s_lshr_b32 s43, s25, 16 +; GFX11-NEXT: s_lshr_b32 s85, s14, 8 +; GFX11-NEXT: s_lshr_b32 s86, s17, 24 +; GFX11-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-NEXT: v_writelane_b32 v37, s43, 30 +; GFX11-NEXT: s_lshr_b32 s43, s25, 8 +; GFX11-NEXT: s_lshr_b32 s87, s17, 8 +; GFX11-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-NEXT: s_lshr_b32 s96, s16, 8 +; GFX11-NEXT: v_writelane_b32 v37, s43, 31 +; GFX11-NEXT: s_lshr_b32 s43, s24, 16 +; GFX11-NEXT: s_lshr_b32 s97, s19, 24 +; GFX11-NEXT: v_writelane_b32 v36, s43, 0 +; GFX11-NEXT: s_lshr_b32 s43, s24, 8 +; GFX11-NEXT: v_writelane_b32 v37, s62, 14 +; GFX11-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-NEXT: s_lshr_b32 s74, s28, 16 +; GFX11-NEXT: v_writelane_b32 v36, s43, 1 +; GFX11-NEXT: s_lshr_b32 s43, s27, 24 +; GFX11-NEXT: v_writelane_b32 v37, s63, 15 +; GFX11-NEXT: s_lshr_b64 s[62:63], s[2:3], 24 +; GFX11-NEXT: s_lshr_b32 s98, s41, 24 +; GFX11-NEXT: v_writelane_b32 v36, s43, 2 +; GFX11-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-NEXT: v_writelane_b32 v37, s62, 12 +; GFX11-NEXT: s_lshr_b32 s99, s41, 16 +; GFX11-NEXT: s_lshr_b32 s100, s41, 8 +; GFX11-NEXT: v_writelane_b32 v36, s43, 3 +; GFX11-NEXT: s_lshr_b32 s43, s27, 8 +; GFX11-NEXT: v_writelane_b32 v37, s63, 13 +; GFX11-NEXT: s_lshr_b64 s[62:63], s[4:5], 24 +; GFX11-NEXT: s_lshr_b32 s44, s40, 16 +; GFX11-NEXT: v_writelane_b32 v36, s43, 4 +; GFX11-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-NEXT: v_writelane_b32 v37, s62, 10 +; GFX11-NEXT: s_lshr_b32 s45, s40, 8 +; GFX11-NEXT: s_lshr_b64 s[76:77], s[16:17], 24 +; GFX11-NEXT: v_writelane_b32 v36, s43, 5 +; GFX11-NEXT: s_lshr_b32 s43, s26, 8 +; GFX11-NEXT: v_writelane_b32 v37, s63, 11 ; GFX11-NEXT: s_lshr_b64 s[62:63], s[6:7], 24 -; GFX11-NEXT: s_lshr_b32 s98, s1, 24 -; GFX11-NEXT: v_writelane_b32 v18, s43, 2 -; GFX11-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-NEXT: v_writelane_b32 v19, s62, 12 -; GFX11-NEXT: s_lshr_b32 s99, s1, 16 -; GFX11-NEXT: s_lshr_b32 s100, s1, 8 -; GFX11-NEXT: v_writelane_b32 v18, s43, 3 -; GFX11-NEXT: s_lshr_b32 s43, s17, 8 -; GFX11-NEXT: v_writelane_b32 v19, s63, 13 +; GFX11-NEXT: s_lshr_b64 s[88:89], s[18:19], 24 +; GFX11-NEXT: v_writelane_b32 v36, s43, 6 +; GFX11-NEXT: s_lshr_b32 s43, s29, 24 +; GFX11-NEXT: v_writelane_b32 v37, s62, 8 +; GFX11-NEXT: s_lshr_b64 s[78:79], s[22:23], 24 +; GFX11-NEXT: s_lshr_b64 s[90:91], s[24:25], 24 +; GFX11-NEXT: v_writelane_b32 v36, s43, 7 +; GFX11-NEXT: s_lshr_b32 s43, s29, 16 +; GFX11-NEXT: v_writelane_b32 v37, s63, 9 ; GFX11-NEXT: s_lshr_b64 s[62:63], s[8:9], 24 -; GFX11-NEXT: s_lshr_b32 s44, s0, 16 -; GFX11-NEXT: v_writelane_b32 v18, s43, 4 -; GFX11-NEXT: s_lshr_b32 s43, s16, 16 -; GFX11-NEXT: v_writelane_b32 v19, s62, 10 -; GFX11-NEXT: s_lshr_b32 s45, s0, 8 -; GFX11-NEXT: s_lshr_b64 s[76:77], s[26:27], 24 -; GFX11-NEXT: v_writelane_b32 v18, s43, 5 -; GFX11-NEXT: s_lshr_b32 s43, s16, 8 -; GFX11-NEXT: v_writelane_b32 v19, s63, 11 +; GFX11-NEXT: s_lshr_b64 s[92:93], s[26:27], 24 +; GFX11-NEXT: v_writelane_b32 v36, s43, 8 +; GFX11-NEXT: s_lshr_b32 s43, s29, 8 +; GFX11-NEXT: v_writelane_b32 v37, s62, 6 +; GFX11-NEXT: s_lshr_b64 s[94:95], s[28:29], 24 +; GFX11-NEXT: s_lshr_b64 s[30:31], s[40:41], 24 +; GFX11-NEXT: v_writelane_b32 v36, s43, 9 +; GFX11-NEXT: s_lshr_b32 s43, s28, 8 +; GFX11-NEXT: v_writelane_b32 v37, s63, 7 ; GFX11-NEXT: s_lshr_b64 s[62:63], s[10:11], 24 -; GFX11-NEXT: s_lshr_b64 s[88:89], s[24:25], 24 -; GFX11-NEXT: v_writelane_b32 v18, s43, 6 -; GFX11-NEXT: s_lshr_b32 s43, s3, 24 -; GFX11-NEXT: v_writelane_b32 v19, s62, 8 -; GFX11-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 -; GFX11-NEXT: s_lshr_b64 s[90:91], s[18:19], 24 -; GFX11-NEXT: v_writelane_b32 v18, s43, 7 -; GFX11-NEXT: s_lshr_b32 s43, s3, 16 -; GFX11-NEXT: v_writelane_b32 v19, s63, 9 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v37, s62, 4 +; GFX11-NEXT: v_writelane_b32 v37, s63, 5 ; GFX11-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 -; GFX11-NEXT: s_lshr_b64 s[92:93], s[16:17], 24 -; GFX11-NEXT: v_writelane_b32 v18, s43, 8 -; GFX11-NEXT: s_lshr_b32 s43, s3, 8 -; GFX11-NEXT: v_writelane_b32 v19, s62, 6 -; GFX11-NEXT: s_lshr_b64 s[94:95], s[2:3], 24 -; GFX11-NEXT: s_lshr_b64 s[30:31], s[0:1], 24 -; GFX11-NEXT: v_writelane_b32 v18, s43, 9 -; GFX11-NEXT: s_lshr_b32 s43, s2, 8 -; GFX11-NEXT: v_writelane_b32 v19, s63, 7 +; GFX11-NEXT: v_writelane_b32 v37, s62, 2 +; GFX11-NEXT: v_writelane_b32 v37, s63, 3 ; GFX11-NEXT: s_lshr_b64 s[62:63], s[14:15], 24 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v19, s62, 4 -; GFX11-NEXT: v_writelane_b32 v19, s63, 5 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[40:41], 24 -; GFX11-NEXT: v_writelane_b32 v19, s62, 2 -; GFX11-NEXT: v_writelane_b32 v19, s63, 3 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[28:29], 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v19, s62, 0 -; GFX11-NEXT: v_writelane_b32 v19, s63, 1 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 +; GFX11-NEXT: v_writelane_b32 v37, s62, 0 +; GFX11-NEXT: v_writelane_b32 v37, s63, 1 +; GFX11-NEXT: s_lshr_b64 s[62:63], s[20:21], 24 ; GFX11-NEXT: s_branch .LBB57_3 ; GFX11-NEXT: .LBB57_2: ; GFX11-NEXT: ; implicit-def: $vcc_hi @@ -85199,7 +85474,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: s_mov_b32 s101, -1 -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 0 +; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 0 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 @@ -85209,7 +85484,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 1 +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 1 ; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 @@ -85221,7 +85496,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 2 +; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 2 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 @@ -85232,7 +85507,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 3 +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 3 ; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 @@ -85244,7 +85519,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 4 +; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 4 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 @@ -85255,7 +85530,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 5 +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 5 ; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; implicit-def: $sgpr45 ; GFX11-NEXT: ; implicit-def: $sgpr44 @@ -85319,20 +85594,20 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr104 ; GFX11-NEXT: ; implicit-def: $sgpr88 ; GFX11-NEXT: ; implicit-def: $sgpr76 -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 6 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 7 +; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 6 +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 7 ; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 8 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 9 +; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 8 +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 9 ; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 10 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 11 +; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 10 +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 11 ; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 12 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 13 +; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 12 +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 13 ; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 14 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 15 +; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 14 +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 15 ; GFX11-NEXT: .LBB57_3: ; %Flow ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s101 ; GFX11-NEXT: s_mov_b32 s101, s104 @@ -85341,26 +85616,22 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: s_mov_b32 s69, s42 ; GFX11-NEXT: s_cbranch_vccnz .LBB57_5 ; GFX11-NEXT: ; %bb.4: ; %cmp.true -; GFX11-NEXT: s_add_u32 s0, s0, 3 -; GFX11-NEXT: s_addc_u32 s1, s1, 0 -; GFX11-NEXT: s_add_u32 s2, s2, 3 -; GFX11-NEXT: s_addc_u32 s3, s3, 0 -; GFX11-NEXT: s_add_u32 s16, s16, 3 -; GFX11-NEXT: s_addc_u32 s17, s17, 0 -; GFX11-NEXT: s_add_u32 s18, s18, 3 -; GFX11-NEXT: s_addc_u32 s19, s19, 0 -; GFX11-NEXT: s_add_u32 s20, s20, 3 -; GFX11-NEXT: s_addc_u32 s21, s21, 0 -; GFX11-NEXT: s_add_u32 s22, s22, 3 -; GFX11-NEXT: s_addc_u32 s23, s23, 0 -; GFX11-NEXT: s_add_u32 s24, s24, 3 -; GFX11-NEXT: s_addc_u32 s25, s25, 0 -; GFX11-NEXT: s_add_u32 s26, s26, 3 -; GFX11-NEXT: s_addc_u32 s27, s27, 0 -; GFX11-NEXT: s_add_u32 s28, s28, 3 -; GFX11-NEXT: s_addc_u32 s29, s29, 0 ; GFX11-NEXT: s_add_u32 s40, s40, 3 ; GFX11-NEXT: s_addc_u32 s41, s41, 0 +; GFX11-NEXT: s_add_u32 s28, s28, 3 +; GFX11-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-NEXT: s_add_u32 s26, s26, 3 +; GFX11-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-NEXT: s_add_u32 s24, s24, 3 +; GFX11-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-NEXT: s_add_u32 s22, s22, 3 +; GFX11-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 ; GFX11-NEXT: s_add_u32 s14, s14, 3 ; GFX11-NEXT: s_addc_u32 s15, s15, 0 ; GFX11-NEXT: s_add_u32 s12, s12, 3 @@ -85373,557 +85644,562 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: s_addc_u32 s7, s7, 0 ; GFX11-NEXT: s_add_u32 s4, s4, 3 ; GFX11-NEXT: s_addc_u32 s5, s5, 0 -; GFX11-NEXT: s_lshr_b32 s42, s25, 8 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[4:5], 24 -; GFX11-NEXT: v_writelane_b32 v19, s42, 16 -; GFX11-NEXT: s_lshr_b32 s42, s24, 16 -; GFX11-NEXT: s_lshr_b32 s101, s5, 24 -; GFX11-NEXT: s_lshr_b32 s102, s5, 16 -; GFX11-NEXT: s_lshr_b32 s103, s5, 8 -; GFX11-NEXT: v_writelane_b32 v19, s42, 17 -; GFX11-NEXT: s_lshr_b32 s42, s24, 8 -; GFX11-NEXT: s_lshr_b32 s104, s4, 16 -; GFX11-NEXT: s_lshr_b32 s47, s4, 8 -; GFX11-NEXT: s_lshr_b32 s46, s7, 24 -; GFX11-NEXT: v_writelane_b32 v19, s42, 18 -; GFX11-NEXT: s_lshr_b32 s42, s23, 24 -; GFX11-NEXT: s_lshr_b32 vcc_hi, s7, 16 -; GFX11-NEXT: s_lshr_b32 s34, s7, 8 -; GFX11-NEXT: s_lshr_b32 s57, s6, 16 -; GFX11-NEXT: v_writelane_b32 v19, s42, 19 -; GFX11-NEXT: s_lshr_b32 s42, s23, 16 -; GFX11-NEXT: s_lshr_b32 s56, s6, 8 -; GFX11-NEXT: s_lshr_b32 s35, s9, 24 -; GFX11-NEXT: s_lshr_b32 s36, s9, 16 -; GFX11-NEXT: v_writelane_b32 v19, s42, 20 -; GFX11-NEXT: s_lshr_b32 s42, s23, 8 -; GFX11-NEXT: s_lshr_b32 s37, s9, 8 -; GFX11-NEXT: s_lshr_b32 s38, s8, 16 -; GFX11-NEXT: s_lshr_b32 s39, s8, 8 -; GFX11-NEXT: v_writelane_b32 v19, s42, 21 -; GFX11-NEXT: s_lshr_b32 s42, s22, 16 -; GFX11-NEXT: s_lshr_b32 s48, s11, 24 -; GFX11-NEXT: s_lshr_b32 s49, s11, 16 -; GFX11-NEXT: s_lshr_b32 s50, s11, 8 -; GFX11-NEXT: v_writelane_b32 v19, s42, 22 -; GFX11-NEXT: s_lshr_b32 s42, s22, 8 -; GFX11-NEXT: s_lshr_b32 s51, s10, 16 -; GFX11-NEXT: s_lshr_b32 s52, s10, 8 -; GFX11-NEXT: s_lshr_b32 s53, s13, 24 -; GFX11-NEXT: v_writelane_b32 v19, s42, 23 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: s_lshr_b32 s42, s19, 8 +; GFX11-NEXT: s_lshr_b64 s[62:63], s[0:1], 24 +; GFX11-NEXT: v_writelane_b32 v37, s42, 16 +; GFX11-NEXT: s_lshr_b32 s42, s18, 16 +; GFX11-NEXT: s_lshr_b32 s101, s1, 24 +; GFX11-NEXT: s_lshr_b32 s102, s1, 16 +; GFX11-NEXT: s_lshr_b32 s103, s1, 8 +; GFX11-NEXT: v_writelane_b32 v37, s42, 17 +; GFX11-NEXT: s_lshr_b32 s42, s18, 8 +; GFX11-NEXT: s_lshr_b32 s104, s0, 16 +; GFX11-NEXT: s_lshr_b32 s47, s0, 8 +; GFX11-NEXT: s_lshr_b32 s46, s3, 24 +; GFX11-NEXT: v_writelane_b32 v37, s42, 18 ; GFX11-NEXT: s_lshr_b32 s42, s21, 24 -; GFX11-NEXT: s_lshr_b32 s54, s13, 16 -; GFX11-NEXT: s_lshr_b32 s55, s13, 8 -; GFX11-NEXT: s_lshr_b32 s64, s12, 16 -; GFX11-NEXT: v_writelane_b32 v19, s42, 24 +; GFX11-NEXT: s_lshr_b32 vcc_hi, s3, 16 +; GFX11-NEXT: s_lshr_b32 s34, s3, 8 +; GFX11-NEXT: s_lshr_b32 s57, s2, 16 +; GFX11-NEXT: v_writelane_b32 v37, s42, 19 ; GFX11-NEXT: s_lshr_b32 s42, s21, 16 -; GFX11-NEXT: s_lshr_b32 s65, s12, 8 -; GFX11-NEXT: s_lshr_b32 s66, s15, 24 -; GFX11-NEXT: s_lshr_b32 s67, s15, 16 -; GFX11-NEXT: v_writelane_b32 v19, s42, 25 +; GFX11-NEXT: s_lshr_b32 s56, s2, 8 +; GFX11-NEXT: s_lshr_b32 s35, s5, 24 +; GFX11-NEXT: s_lshr_b32 s36, s5, 16 +; GFX11-NEXT: v_writelane_b32 v37, s42, 20 ; GFX11-NEXT: s_lshr_b32 s42, s21, 8 -; GFX11-NEXT: s_lshr_b32 s68, s15, 8 -; GFX11-NEXT: s_lshr_b32 s59, s14, 16 -; GFX11-NEXT: s_lshr_b32 s58, s14, 8 -; GFX11-NEXT: v_writelane_b32 v19, s42, 26 +; GFX11-NEXT: s_lshr_b32 s37, s5, 8 +; GFX11-NEXT: s_lshr_b32 s38, s4, 16 +; GFX11-NEXT: s_lshr_b32 s39, s4, 8 +; GFX11-NEXT: v_writelane_b32 v37, s42, 21 ; GFX11-NEXT: s_lshr_b32 s42, s20, 16 -; GFX11-NEXT: s_lshr_b32 s70, s41, 24 -; GFX11-NEXT: s_lshr_b32 s71, s41, 16 -; GFX11-NEXT: s_lshr_b32 s60, s41, 8 -; GFX11-NEXT: v_writelane_b32 v19, s42, 27 +; GFX11-NEXT: s_lshr_b32 s48, s7, 24 +; GFX11-NEXT: s_lshr_b32 s49, s7, 16 +; GFX11-NEXT: s_lshr_b32 s50, s7, 8 +; GFX11-NEXT: v_writelane_b32 v37, s42, 22 ; GFX11-NEXT: s_lshr_b32 s42, s20, 8 -; GFX11-NEXT: s_lshr_b32 s80, s40, 16 -; GFX11-NEXT: s_lshr_b32 s61, s40, 8 -; GFX11-NEXT: s_lshr_b32 s81, s29, 24 -; GFX11-NEXT: v_writelane_b32 v19, s42, 28 -; GFX11-NEXT: s_lshr_b32 s42, s19, 24 -; GFX11-NEXT: s_lshr_b32 s82, s29, 16 -; GFX11-NEXT: s_lshr_b32 s83, s29, 8 -; GFX11-NEXT: s_lshr_b32 s84, s28, 16 -; GFX11-NEXT: v_writelane_b32 v19, s42, 29 -; GFX11-NEXT: s_lshr_b32 s42, s19, 16 -; GFX11-NEXT: s_lshr_b32 s85, s28, 8 -; GFX11-NEXT: s_lshr_b32 s86, s27, 24 -; GFX11-NEXT: s_lshr_b32 s72, s27, 16 -; GFX11-NEXT: v_writelane_b32 v19, s42, 30 -; GFX11-NEXT: s_lshr_b32 s42, s19, 8 -; GFX11-NEXT: s_lshr_b32 s87, s27, 8 -; GFX11-NEXT: s_lshr_b32 s73, s26, 16 -; GFX11-NEXT: s_lshr_b32 s96, s26, 8 -; GFX11-NEXT: v_writelane_b32 v19, s42, 31 -; GFX11-NEXT: s_lshr_b32 s42, s18, 16 -; GFX11-NEXT: s_lshr_b32 s97, s25, 24 -; GFX11-NEXT: v_writelane_b32 v18, s42, 0 -; GFX11-NEXT: s_lshr_b32 s42, s18, 8 -; GFX11-NEXT: v_writelane_b32 v19, s62, 14 -; GFX11-NEXT: s_lshr_b32 s69, s25, 16 -; GFX11-NEXT: s_lshr_b32 s74, s2, 16 -; GFX11-NEXT: v_writelane_b32 v18, s42, 1 -; GFX11-NEXT: s_lshr_b32 s42, s17, 24 -; GFX11-NEXT: v_writelane_b32 v19, s63, 15 +; GFX11-NEXT: s_lshr_b32 s51, s6, 16 +; GFX11-NEXT: s_lshr_b32 s52, s6, 8 +; GFX11-NEXT: s_lshr_b32 s53, s9, 24 +; GFX11-NEXT: v_writelane_b32 v37, s42, 23 +; GFX11-NEXT: s_lshr_b32 s42, s23, 24 +; GFX11-NEXT: s_lshr_b32 s54, s9, 16 +; GFX11-NEXT: s_lshr_b32 s55, s9, 8 +; GFX11-NEXT: s_lshr_b32 s64, s8, 16 +; GFX11-NEXT: v_writelane_b32 v37, s42, 24 +; GFX11-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-NEXT: s_lshr_b32 s65, s8, 8 +; GFX11-NEXT: s_lshr_b32 s66, s11, 24 +; GFX11-NEXT: s_lshr_b32 s67, s11, 16 +; GFX11-NEXT: v_writelane_b32 v37, s42, 25 +; GFX11-NEXT: s_lshr_b32 s42, s23, 8 +; GFX11-NEXT: s_lshr_b32 s68, s11, 8 +; GFX11-NEXT: s_lshr_b32 s59, s10, 16 +; GFX11-NEXT: s_lshr_b32 s58, s10, 8 +; GFX11-NEXT: v_writelane_b32 v37, s42, 26 +; GFX11-NEXT: s_lshr_b32 s42, s22, 16 +; GFX11-NEXT: s_lshr_b32 s70, s13, 24 +; GFX11-NEXT: s_lshr_b32 s71, s13, 16 +; GFX11-NEXT: s_lshr_b32 s60, s13, 8 +; GFX11-NEXT: v_writelane_b32 v37, s42, 27 +; GFX11-NEXT: s_lshr_b32 s42, s22, 8 +; GFX11-NEXT: s_lshr_b32 s80, s12, 16 +; GFX11-NEXT: s_lshr_b32 s61, s12, 8 +; GFX11-NEXT: s_lshr_b32 s81, s15, 24 +; GFX11-NEXT: v_writelane_b32 v37, s42, 28 +; GFX11-NEXT: s_lshr_b32 s42, s25, 24 +; GFX11-NEXT: s_lshr_b32 s82, s15, 16 +; GFX11-NEXT: s_lshr_b32 s83, s15, 8 +; GFX11-NEXT: s_lshr_b32 s84, s14, 16 +; GFX11-NEXT: v_writelane_b32 v37, s42, 29 +; GFX11-NEXT: s_lshr_b32 s42, s25, 16 +; GFX11-NEXT: s_lshr_b32 s85, s14, 8 +; GFX11-NEXT: s_lshr_b32 s86, s17, 24 +; GFX11-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-NEXT: v_writelane_b32 v37, s42, 30 +; GFX11-NEXT: s_lshr_b32 s42, s25, 8 +; GFX11-NEXT: s_lshr_b32 s87, s17, 8 +; GFX11-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-NEXT: s_lshr_b32 s96, s16, 8 +; GFX11-NEXT: v_writelane_b32 v37, s42, 31 +; GFX11-NEXT: s_lshr_b32 s42, s24, 16 +; GFX11-NEXT: s_lshr_b32 s97, s19, 24 +; GFX11-NEXT: v_writelane_b32 v36, s42, 0 +; GFX11-NEXT: s_lshr_b32 s42, s24, 8 +; GFX11-NEXT: v_writelane_b32 v37, s62, 14 +; GFX11-NEXT: s_lshr_b32 s69, s19, 16 +; GFX11-NEXT: s_lshr_b32 s74, s28, 16 +; GFX11-NEXT: v_writelane_b32 v36, s42, 1 +; GFX11-NEXT: s_lshr_b32 s42, s27, 24 +; GFX11-NEXT: v_writelane_b32 v37, s63, 15 +; GFX11-NEXT: s_lshr_b64 s[62:63], s[2:3], 24 +; GFX11-NEXT: s_lshr_b32 s43, s28, 8 +; GFX11-NEXT: v_writelane_b32 v36, s42, 2 +; GFX11-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-NEXT: v_writelane_b32 v37, s62, 12 +; GFX11-NEXT: s_lshr_b32 s98, s41, 24 +; GFX11-NEXT: s_lshr_b32 s99, s41, 16 +; GFX11-NEXT: v_writelane_b32 v36, s42, 3 +; GFX11-NEXT: s_lshr_b32 s42, s27, 8 +; GFX11-NEXT: v_writelane_b32 v37, s63, 13 +; GFX11-NEXT: s_lshr_b64 s[62:63], s[4:5], 24 +; GFX11-NEXT: s_lshr_b32 s100, s41, 8 +; GFX11-NEXT: v_writelane_b32 v36, s42, 4 +; GFX11-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-NEXT: v_writelane_b32 v37, s62, 10 +; GFX11-NEXT: s_lshr_b32 s44, s40, 16 +; GFX11-NEXT: s_lshr_b32 s45, s40, 8 +; GFX11-NEXT: v_writelane_b32 v36, s42, 5 +; GFX11-NEXT: s_lshr_b32 s42, s26, 8 +; GFX11-NEXT: v_writelane_b32 v37, s63, 11 ; GFX11-NEXT: s_lshr_b64 s[62:63], s[6:7], 24 -; GFX11-NEXT: s_lshr_b32 s43, s2, 8 -; GFX11-NEXT: v_writelane_b32 v18, s42, 2 -; GFX11-NEXT: s_lshr_b32 s42, s17, 16 -; GFX11-NEXT: v_writelane_b32 v19, s62, 12 -; GFX11-NEXT: s_lshr_b32 s98, s1, 24 -; GFX11-NEXT: s_lshr_b32 s99, s1, 16 -; GFX11-NEXT: v_writelane_b32 v18, s42, 3 -; GFX11-NEXT: s_lshr_b32 s42, s17, 8 -; GFX11-NEXT: v_writelane_b32 v19, s63, 13 +; GFX11-NEXT: s_lshr_b64 s[76:77], s[16:17], 24 +; GFX11-NEXT: v_writelane_b32 v36, s42, 6 +; GFX11-NEXT: s_lshr_b32 s42, s29, 24 +; GFX11-NEXT: v_writelane_b32 v37, s62, 8 +; GFX11-NEXT: s_lshr_b64 s[88:89], s[18:19], 24 +; GFX11-NEXT: s_lshr_b64 s[78:79], s[22:23], 24 +; GFX11-NEXT: v_writelane_b32 v36, s42, 7 +; GFX11-NEXT: s_lshr_b32 s42, s29, 16 +; GFX11-NEXT: v_writelane_b32 v37, s63, 9 ; GFX11-NEXT: s_lshr_b64 s[62:63], s[8:9], 24 -; GFX11-NEXT: s_lshr_b32 s100, s1, 8 -; GFX11-NEXT: v_writelane_b32 v18, s42, 4 -; GFX11-NEXT: s_lshr_b32 s42, s16, 16 -; GFX11-NEXT: v_writelane_b32 v19, s62, 10 -; GFX11-NEXT: s_lshr_b32 s44, s0, 16 -; GFX11-NEXT: s_lshr_b32 s45, s0, 8 -; GFX11-NEXT: v_writelane_b32 v18, s42, 5 -; GFX11-NEXT: s_lshr_b32 s42, s16, 8 -; GFX11-NEXT: v_writelane_b32 v19, s63, 11 +; GFX11-NEXT: s_lshr_b64 s[90:91], s[24:25], 24 +; GFX11-NEXT: v_writelane_b32 v36, s42, 8 +; GFX11-NEXT: s_lshr_b32 s42, s29, 8 +; GFX11-NEXT: v_writelane_b32 v37, s62, 6 +; GFX11-NEXT: s_lshr_b64 s[92:93], s[26:27], 24 +; GFX11-NEXT: s_lshr_b64 s[94:95], s[28:29], 24 +; GFX11-NEXT: s_lshr_b64 s[30:31], s[40:41], 24 +; GFX11-NEXT: v_writelane_b32 v36, s42, 9 +; GFX11-NEXT: v_writelane_b32 v37, s63, 7 ; GFX11-NEXT: s_lshr_b64 s[62:63], s[10:11], 24 -; GFX11-NEXT: s_lshr_b64 s[76:77], s[26:27], 24 -; GFX11-NEXT: v_writelane_b32 v18, s42, 6 -; GFX11-NEXT: s_lshr_b32 s42, s3, 24 -; GFX11-NEXT: v_writelane_b32 v19, s62, 8 -; GFX11-NEXT: s_lshr_b64 s[88:89], s[24:25], 24 -; GFX11-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 -; GFX11-NEXT: v_writelane_b32 v18, s42, 7 -; GFX11-NEXT: s_lshr_b32 s42, s3, 16 -; GFX11-NEXT: v_writelane_b32 v19, s63, 9 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v37, s62, 4 +; GFX11-NEXT: v_writelane_b32 v37, s63, 5 ; GFX11-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 -; GFX11-NEXT: s_lshr_b64 s[90:91], s[18:19], 24 -; GFX11-NEXT: v_writelane_b32 v18, s42, 8 -; GFX11-NEXT: s_lshr_b32 s42, s3, 8 -; GFX11-NEXT: v_writelane_b32 v19, s62, 6 -; GFX11-NEXT: s_lshr_b64 s[92:93], s[16:17], 24 -; GFX11-NEXT: s_lshr_b64 s[94:95], s[2:3], 24 -; GFX11-NEXT: s_lshr_b64 s[30:31], s[0:1], 24 -; GFX11-NEXT: v_writelane_b32 v18, s42, 9 -; GFX11-NEXT: v_writelane_b32 v19, s63, 7 +; GFX11-NEXT: v_writelane_b32 v37, s62, 2 +; GFX11-NEXT: v_writelane_b32 v37, s63, 3 ; GFX11-NEXT: s_lshr_b64 s[62:63], s[14:15], 24 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v19, s62, 4 -; GFX11-NEXT: v_writelane_b32 v19, s63, 5 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[40:41], 24 -; GFX11-NEXT: v_writelane_b32 v19, s62, 2 -; GFX11-NEXT: v_writelane_b32 v19, s63, 3 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[28:29], 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v19, s62, 0 -; GFX11-NEXT: v_writelane_b32 v19, s63, 1 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 +; GFX11-NEXT: v_writelane_b32 v37, s62, 0 +; GFX11-NEXT: v_writelane_b32 v37, s63, 1 +; GFX11-NEXT: s_lshr_b64 s[62:63], s[20:21], 24 ; GFX11-NEXT: .LBB57_5: ; %end ; GFX11-NEXT: s_lshl_b32 s43, s43, 8 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_and_b32 s28, s28, 0xff ; GFX11-NEXT: s_and_b32 s42, s74, 0xff -; GFX11-NEXT: s_or_b32 s2, s2, s43 +; GFX11-NEXT: s_or_b32 s28, s28, s43 ; GFX11-NEXT: s_lshl_b32 s43, s94, 8 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_and_b32 s28, s28, 0xffff +; GFX11-NEXT: s_or_b32 s42, s42, s43 +; GFX11-NEXT: s_and_b32 s29, s29, 0xff +; GFX11-NEXT: s_lshl_b32 s42, s42, 16 +; GFX11-NEXT: v_readlane_b32 s43, v36, 7 +; GFX11-NEXT: s_or_b32 s28, s28, s42 +; GFX11-NEXT: v_readlane_b32 s42, v36, 9 +; GFX11-NEXT: s_and_b32 s26, s26, 0xff +; GFX11-NEXT: s_and_b32 s27, s27, 0xff +; GFX11-NEXT: s_lshl_b32 s43, s43, 8 +; GFX11-NEXT: s_and_b32 s24, s24, 0xff +; GFX11-NEXT: s_lshl_b32 s42, s42, 8 +; GFX11-NEXT: s_and_b32 s25, s25, 0xff +; GFX11-NEXT: s_or_b32 s29, s29, s42 +; GFX11-NEXT: v_readlane_b32 s42, v36, 8 +; GFX11-NEXT: s_and_b32 s29, s29, 0xffff +; GFX11-NEXT: s_and_b32 s22, s22, 0xff +; GFX11-NEXT: s_and_b32 s23, s23, 0xff +; GFX11-NEXT: s_and_b32 s20, s20, 0xff +; GFX11-NEXT: s_and_b32 s42, s42, 0xff +; GFX11-NEXT: s_and_b32 s21, s21, 0xff ; GFX11-NEXT: s_or_b32 s42, s42, s43 ; GFX11-NEXT: s_lshl_b32 s45, s45, 8 ; GFX11-NEXT: s_lshl_b32 s42, s42, 16 -; GFX11-NEXT: s_and_b32 s0, s0, 0xff -; GFX11-NEXT: s_or_b32 s2, s2, s42 -; GFX11-NEXT: v_readlane_b32 s42, v18, 9 -; GFX11-NEXT: s_or_b32 s0, s0, s45 +; GFX11-NEXT: s_and_b32 s40, s40, 0xff +; GFX11-NEXT: s_or_b32 s29, s29, s42 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v3, s28 :: v_dual_mov_b32 v4, s29 +; GFX11-NEXT: v_readlane_b32 s28, v36, 6 +; GFX11-NEXT: v_readlane_b32 s29, v36, 5 +; GFX11-NEXT: s_or_b32 s40, s40, s45 ; GFX11-NEXT: s_lshl_b32 s45, s30, 8 ; GFX11-NEXT: s_and_b32 s44, s44, 0xff -; GFX11-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-NEXT: s_lshl_b32 s28, s28, 8 +; GFX11-NEXT: s_and_b32 s29, s29, 0xff +; GFX11-NEXT: s_or_b32 s26, s26, s28 +; GFX11-NEXT: s_lshl_b32 s28, s92, 8 +; GFX11-NEXT: s_and_b32 s26, s26, 0xffff +; GFX11-NEXT: s_or_b32 s28, s29, s28 +; GFX11-NEXT: v_readlane_b32 s29, v36, 2 +; GFX11-NEXT: s_lshl_b32 s28, s28, 16 ; GFX11-NEXT: s_or_b32 s44, s44, s45 -; GFX11-NEXT: s_lshl_b32 s42, s42, 8 -; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_or_b32 s26, s26, s28 +; GFX11-NEXT: v_readlane_b32 s28, v36, 4 +; GFX11-NEXT: s_lshl_b32 s29, s29, 8 +; GFX11-NEXT: s_and_b32 s18, s18, 0xff +; GFX11-NEXT: s_and_b32 s40, s40, 0xffff ; GFX11-NEXT: s_lshl_b32 s44, s44, 16 -; GFX11-NEXT: s_or_b32 s3, s3, s42 -; GFX11-NEXT: v_readlane_b32 s42, v18, 8 -; GFX11-NEXT: v_readlane_b32 s43, v18, 7 -; GFX11-NEXT: s_or_b32 s0, s0, s44 -; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_lshl_b32 s28, s28, 8 +; GFX11-NEXT: s_or_b32 s40, s40, s44 +; GFX11-NEXT: s_or_b32 s27, s27, s28 +; GFX11-NEXT: v_readlane_b32 s28, v36, 3 +; GFX11-NEXT: s_and_b32 s27, s27, 0xffff +; GFX11-NEXT: s_and_b32 s41, s41, 0xff ; GFX11-NEXT: s_lshl_b32 s44, s100, 8 ; GFX11-NEXT: s_lshl_b32 s45, s98, 8 -; GFX11-NEXT: s_or_b32 s1, s1, s44 +; GFX11-NEXT: s_and_b32 s28, s28, 0xff +; GFX11-NEXT: s_or_b32 s41, s41, s44 +; GFX11-NEXT: s_or_b32 s28, s28, s29 +; GFX11-NEXT: v_readlane_b32 s29, v36, 0 +; GFX11-NEXT: s_lshl_b32 s28, s28, 16 ; GFX11-NEXT: s_and_b32 s44, s99, 0xff -; GFX11-NEXT: s_and_b32 s42, s42, 0xff +; GFX11-NEXT: s_or_b32 s27, s27, s28 +; GFX11-NEXT: v_readlane_b32 s28, v36, 1 +; GFX11-NEXT: s_and_b32 s29, s29, 0xff +; GFX11-NEXT: v_dual_mov_b32 v5, s26 :: v_dual_mov_b32 v6, s27 +; GFX11-NEXT: v_readlane_b32 s26, v37, 19 +; GFX11-NEXT: s_lshl_b32 s28, s28, 8 ; GFX11-NEXT: s_or_b32 s44, s44, s45 -; GFX11-NEXT: s_lshl_b32 s43, s43, 8 -; GFX11-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-NEXT: s_or_b32 s24, s24, s28 +; GFX11-NEXT: s_lshl_b32 s28, s90, 8 +; GFX11-NEXT: s_and_b32 s24, s24, 0xffff +; GFX11-NEXT: s_or_b32 s28, s29, s28 +; GFX11-NEXT: v_readlane_b32 s29, v37, 29 +; GFX11-NEXT: s_lshl_b32 s28, s28, 16 +; GFX11-NEXT: s_lshl_b32 s26, s26, 8 +; GFX11-NEXT: s_or_b32 s24, s24, s28 +; GFX11-NEXT: v_readlane_b32 s28, v37, 31 +; GFX11-NEXT: s_lshl_b32 s29, s29, 8 +; GFX11-NEXT: s_and_b32 s19, s19, 0xff +; GFX11-NEXT: s_and_b32 s41, s41, 0xffff ; GFX11-NEXT: s_lshl_b32 s44, s44, 16 -; GFX11-NEXT: s_or_b32 s42, s42, s43 -; GFX11-NEXT: s_or_b32 s1, s1, s44 -; GFX11-NEXT: s_and_b32 s3, s3, 0xffff -; GFX11-NEXT: s_lshl_b32 s42, s42, 16 -; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 -; GFX11-NEXT: v_readlane_b32 s0, v18, 6 -; GFX11-NEXT: s_or_b32 s3, s3, s42 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 -; GFX11-NEXT: v_readlane_b32 s2, v18, 5 -; GFX11-NEXT: s_lshl_b32 s0, s0, 8 -; GFX11-NEXT: s_and_b32 s1, s16, 0xff -; GFX11-NEXT: v_readlane_b32 s3, v18, 2 -; GFX11-NEXT: s_or_b32 s0, s1, s0 -; GFX11-NEXT: s_lshl_b32 s1, s92, 8 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: s_and_b32 s0, s0, 0xffff -; GFX11-NEXT: s_or_b32 s1, s2, s1 -; GFX11-NEXT: v_readlane_b32 s2, v18, 4 -; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: s_lshl_b32 s3, s3, 8 -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_and_b32 s1, s17, 0xff -; GFX11-NEXT: s_lshl_b32 s2, s2, 8 -; GFX11-NEXT: v_readlane_b32 s16, v18, 0 -; GFX11-NEXT: s_or_b32 s1, s1, s2 -; GFX11-NEXT: v_readlane_b32 s2, v18, 3 -; GFX11-NEXT: s_and_b32 s1, s1, 0xffff -; GFX11-NEXT: v_readlane_b32 s17, v19, 29 -; GFX11-NEXT: s_and_b32 s16, s16, 0xff -; GFX11-NEXT: v_readlane_b32 s100, v17, 4 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: v_readlane_b32 s99, v17, 3 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_and_b32 s3, s18, 0xff -; GFX11-NEXT: s_lshl_b32 s2, s2, 16 -; GFX11-NEXT: s_lshl_b32 s17, s17, 8 -; GFX11-NEXT: s_or_b32 s1, s1, s2 -; GFX11-NEXT: v_readlane_b32 s2, v18, 1 -; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 -; GFX11-NEXT: v_readlane_b32 s0, v19, 28 -; GFX11-NEXT: s_and_b32 s1, s20, 0xff -; GFX11-NEXT: s_lshl_b32 s2, s2, 8 -; GFX11-NEXT: v_readlane_b32 s18, v19, 19 -; GFX11-NEXT: s_or_b32 s2, s3, s2 -; GFX11-NEXT: s_lshl_b32 s3, s90, 8 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_or_b32 s3, s16, s3 -; GFX11-NEXT: v_readlane_b32 s16, v19, 31 -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_lshl_b32 s0, s0, 8 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_and_b32 s3, s19, 0xff -; GFX11-NEXT: s_lshl_b32 s16, s16, 8 -; GFX11-NEXT: s_or_b32 s0, s1, s0 -; GFX11-NEXT: s_or_b32 s3, s3, s16 -; GFX11-NEXT: v_readlane_b32 s16, v19, 30 -; GFX11-NEXT: s_and_b32 s3, s3, 0xffff -; GFX11-NEXT: s_lshl_b32 s1, s78, 8 -; GFX11-NEXT: s_and_b32 s0, s0, 0xffff -; GFX11-NEXT: s_lshl_b32 s18, s18, 8 +; GFX11-NEXT: s_lshl_b32 s28, s28, 8 +; GFX11-NEXT: s_or_b32 s41, s41, s44 +; GFX11-NEXT: s_or_b32 s25, s25, s28 +; GFX11-NEXT: v_readlane_b32 s28, v37, 30 +; GFX11-NEXT: s_and_b32 s25, s25, 0xffff +; GFX11-NEXT: v_dual_mov_b32 v1, s40 :: v_dual_mov_b32 v2, s41 ; GFX11-NEXT: s_and_b32 s16, s16, 0xff -; GFX11-NEXT: s_lshl_b32 s19, s86, 8 -; GFX11-NEXT: s_or_b32 s16, s16, s17 -; GFX11-NEXT: v_readlane_b32 s17, v19, 21 -; GFX11-NEXT: s_lshl_b32 s16, s16, 16 -; GFX11-NEXT: v_readlane_b32 s98, v17, 2 -; GFX11-NEXT: s_or_b32 s3, s3, s16 +; GFX11-NEXT: s_and_b32 s28, s28, 0xff +; GFX11-NEXT: s_and_b32 s17, s17, 0xff +; GFX11-NEXT: s_or_b32 s28, s28, s29 +; GFX11-NEXT: s_and_b32 s14, s14, 0xff +; GFX11-NEXT: s_lshl_b32 s28, s28, 16 +; GFX11-NEXT: s_and_b32 s15, s15, 0xff +; GFX11-NEXT: s_or_b32 s25, s25, s28 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 -; GFX11-NEXT: v_readlane_b32 s2, v19, 27 -; GFX11-NEXT: v_readlane_b32 s3, v19, 24 -; GFX11-NEXT: v_readlane_b32 s16, v19, 22 -; GFX11-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-NEXT: v_dual_mov_b32 v7, s24 :: v_dual_mov_b32 v8, s25 +; GFX11-NEXT: v_readlane_b32 s24, v37, 28 +; GFX11-NEXT: v_readlane_b32 s25, v37, 27 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-NEXT: s_and_b32 s12, s12, 0xff +; GFX11-NEXT: s_lshl_b32 s24, s24, 8 +; GFX11-NEXT: s_and_b32 s25, s25, 0xff +; GFX11-NEXT: s_or_b32 s22, s22, s24 +; GFX11-NEXT: s_lshl_b32 s24, s78, 8 +; GFX11-NEXT: s_and_b32 s22, s22, 0xffff +; GFX11-NEXT: s_or_b32 s24, s25, s24 +; GFX11-NEXT: v_readlane_b32 s25, v37, 24 +; GFX11-NEXT: s_lshl_b32 s24, s24, 16 +; GFX11-NEXT: s_and_b32 s13, s13, 0xff +; GFX11-NEXT: s_or_b32 s22, s22, s24 +; GFX11-NEXT: v_readlane_b32 s24, v37, 26 +; GFX11-NEXT: s_lshl_b32 s25, s25, 8 +; GFX11-NEXT: s_and_b32 s10, s10, 0xff +; GFX11-NEXT: s_and_b32 s11, s11, 0xff +; GFX11-NEXT: s_and_b32 s8, s8, 0xff +; GFX11-NEXT: s_lshl_b32 s24, s24, 8 +; GFX11-NEXT: s_and_b32 s9, s9, 0xff +; GFX11-NEXT: s_or_b32 s23, s23, s24 +; GFX11-NEXT: v_readlane_b32 s24, v37, 25 +; GFX11-NEXT: s_and_b32 s23, s23, 0xffff +; GFX11-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-NEXT: s_and_b32 s7, s7, 0xff +; GFX11-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-NEXT: s_and_b32 s24, s24, 0xff +; GFX11-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-NEXT: s_or_b32 s24, s24, s25 +; GFX11-NEXT: v_readlane_b32 s25, v37, 22 +; GFX11-NEXT: s_lshl_b32 s24, s24, 16 ; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s3, 8 -; GFX11-NEXT: s_or_b32 s1, s2, s1 -; GFX11-NEXT: v_readlane_b32 s2, v19, 26 -; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: s_and_b32 s16, s16, 0xff -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_and_b32 s1, s21, 0xff -; GFX11-NEXT: s_lshl_b32 s2, s2, 8 -; GFX11-NEXT: v_readlane_b32 s86, v16, 30 -; GFX11-NEXT: s_or_b32 s1, s1, s2 -; GFX11-NEXT: v_readlane_b32 s2, v19, 25 -; GFX11-NEXT: s_and_b32 s1, s1, 0xffff -; GFX11-NEXT: v_readlane_b32 s31, v16, 1 -; GFX11-NEXT: v_readlane_b32 s30, v16, 0 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_and_b32 s3, s22, 0xff -; GFX11-NEXT: s_lshl_b32 s2, s2, 16 -; GFX11-NEXT: s_or_b32 s1, s1, s2 -; GFX11-NEXT: v_readlane_b32 s2, v19, 23 -; GFX11-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 -; GFX11-NEXT: v_readlane_b32 s1, v19, 18 -; GFX11-NEXT: s_and_b32 s0, s24, 0xff -; GFX11-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-NEXT: s_or_b32 s23, s23, s24 +; GFX11-NEXT: v_readlane_b32 s24, v37, 23 +; GFX11-NEXT: s_and_b32 s25, s25, 0xff +; GFX11-NEXT: v_dual_mov_b32 v9, s22 :: v_dual_mov_b32 v10, s23 +; GFX11-NEXT: s_lshl_b32 s22, s88, 8 +; GFX11-NEXT: s_lshl_b32 s24, s24, 8 +; GFX11-NEXT: s_lshl_b32 s23, s97, 8 +; GFX11-NEXT: s_or_b32 s20, s20, s24 +; GFX11-NEXT: s_lshl_b32 s24, s62, 8 +; GFX11-NEXT: s_and_b32 s20, s20, 0xffff +; GFX11-NEXT: s_or_b32 s24, s25, s24 +; GFX11-NEXT: v_readlane_b32 s25, v37, 21 +; GFX11-NEXT: s_lshl_b32 s24, s24, 16 +; GFX11-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-NEXT: s_or_b32 s20, s20, s24 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s25, s25, 8 +; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_or_b32 s21, s21, s25 +; GFX11-NEXT: v_readlane_b32 s25, v37, 20 +; GFX11-NEXT: s_and_b32 s21, s21, 0xffff +; GFX11-NEXT: v_readlane_b32 s100, v35, 4 +; GFX11-NEXT: v_readlane_b32 s99, v35, 3 +; GFX11-NEXT: v_readlane_b32 s98, v35, 2 +; GFX11-NEXT: s_and_b32 s25, s25, 0xff +; GFX11-NEXT: v_readlane_b32 s97, v35, 1 +; GFX11-NEXT: s_or_b32 s25, s25, s26 +; GFX11-NEXT: v_readlane_b32 s31, v34, 1 +; GFX11-NEXT: s_lshl_b32 s24, s25, 16 +; GFX11-NEXT: v_readlane_b32 s30, v34, 0 +; GFX11-NEXT: s_or_b32 s21, s21, s24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b32 s2, s3, s2 -; GFX11-NEXT: s_lshl_b32 s3, s62, 8 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_or_b32 s3, s16, s3 -; GFX11-NEXT: s_and_b32 s16, s23, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_or_b32 s16, s16, s17 -; GFX11-NEXT: v_readlane_b32 s17, v19, 20 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_and_b32 s3, s16, 0xffff -; GFX11-NEXT: s_lshl_b32 s1, s1, 8 -; GFX11-NEXT: s_and_b32 s17, s17, 0xff -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s17, s17, s18 -; GFX11-NEXT: s_and_b32 s0, s0, 0xffff -; GFX11-NEXT: s_lshl_b32 s16, s17, 16 -; GFX11-NEXT: s_lshl_b32 s17, s97, 8 -; GFX11-NEXT: s_or_b32 s3, s3, s16 +; GFX11-NEXT: v_dual_mov_b32 v11, s20 :: v_dual_mov_b32 v12, s21 +; GFX11-NEXT: v_readlane_b32 s20, v37, 18 +; GFX11-NEXT: v_readlane_b32 s21, v37, 17 +; GFX11-NEXT: s_lshl_b32 s20, s20, 8 +; GFX11-NEXT: s_and_b32 s21, s21, 0xff +; GFX11-NEXT: s_or_b32 s18, s18, s20 +; GFX11-NEXT: s_or_b32 s20, s21, s22 +; GFX11-NEXT: v_readlane_b32 s21, v37, 16 +; GFX11-NEXT: s_and_b32 s22, s69, 0xff +; GFX11-NEXT: s_and_b32 s18, s18, 0xffff +; GFX11-NEXT: s_lshl_b32 s20, s20, 16 +; GFX11-NEXT: v_readlane_b32 s69, v34, 21 +; GFX11-NEXT: s_lshl_b32 s21, s21, 8 +; GFX11-NEXT: s_or_b32 s18, s18, s20 +; GFX11-NEXT: s_or_b32 s19, s19, s21 +; GFX11-NEXT: s_or_b32 s21, s22, s23 +; GFX11-NEXT: s_and_b32 s19, s19, 0xffff +; GFX11-NEXT: s_lshl_b32 s21, s21, 16 +; GFX11-NEXT: s_lshl_b32 s20, s96, 8 +; GFX11-NEXT: s_or_b32 s19, s19, s21 +; GFX11-NEXT: s_and_b32 s21, s73, 0xff +; GFX11-NEXT: s_lshl_b32 s22, s76, 8 +; GFX11-NEXT: s_or_b32 s16, s16, s20 +; GFX11-NEXT: s_or_b32 s20, s21, s22 +; GFX11-NEXT: s_lshl_b32 s21, s87, 8 +; GFX11-NEXT: s_and_b32 s22, s72, 0xff +; GFX11-NEXT: s_lshl_b32 s23, s86, 8 +; GFX11-NEXT: s_or_b32 s17, s17, s21 +; GFX11-NEXT: s_or_b32 s21, s22, s23 +; GFX11-NEXT: v_dual_mov_b32 v1, s18 :: v_dual_mov_b32 v2, s19 +; GFX11-NEXT: v_readlane_b32 s18, v37, 0 +; GFX11-NEXT: s_and_b32 s16, s16, 0xffff +; GFX11-NEXT: s_lshl_b32 s20, s20, 16 +; GFX11-NEXT: s_and_b32 s17, s17, 0xffff +; GFX11-NEXT: s_lshl_b32 s21, s21, 16 +; GFX11-NEXT: s_or_b32 s16, s16, s20 +; GFX11-NEXT: s_or_b32 s17, s17, s21 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 -; GFX11-NEXT: v_readlane_b32 s2, v19, 17 -; GFX11-NEXT: s_lshl_b32 s3, s88, 8 -; GFX11-NEXT: s_and_b32 s16, s69, 0xff -; GFX11-NEXT: s_and_b32 s18, s72, 0xff -; GFX11-NEXT: v_readlane_b32 s97, v17, 1 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: v_readlane_b32 s69, v16, 21 -; GFX11-NEXT: s_or_b32 s1, s2, s3 -; GFX11-NEXT: v_readlane_b32 s3, v19, 16 -; GFX11-NEXT: s_and_b32 s2, s25, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_lshl_b32 s3, s3, 8 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s16, s17 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_and_b32 s16, s73, 0xff -; GFX11-NEXT: s_or_b32 s1, s2, s3 -; GFX11-NEXT: s_and_b32 s2, s26, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s96, 8 -; GFX11-NEXT: s_lshl_b32 s17, s76, 8 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s16, s17 -; GFX11-NEXT: s_and_b32 s16, s27, 0xff -; GFX11-NEXT: s_lshl_b32 s17, s87, 8 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_or_b32 s16, s16, s17 +; GFX11-NEXT: v_dual_mov_b32 v3, s16 :: v_dual_mov_b32 v4, s17 +; GFX11-NEXT: s_lshl_b32 s16, s85, 8 +; GFX11-NEXT: s_and_b32 s17, s84, 0xff +; GFX11-NEXT: s_lshl_b32 s18, s18, 8 +; GFX11-NEXT: v_readlane_b32 s19, v37, 1 +; GFX11-NEXT: s_or_b32 s14, s14, s16 +; GFX11-NEXT: s_or_b32 s16, s17, s18 +; GFX11-NEXT: s_lshl_b32 s17, s83, 8 +; GFX11-NEXT: s_and_b32 s18, s82, 0xff +; GFX11-NEXT: s_lshl_b32 s19, s81, 8 +; GFX11-NEXT: s_or_b32 s15, s15, s17 ; GFX11-NEXT: s_or_b32 s17, s18, s19 -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_and_b32 s16, s16, 0xffff +; GFX11-NEXT: v_readlane_b32 s18, v37, 2 +; GFX11-NEXT: s_and_b32 s14, s14, 0xffff +; GFX11-NEXT: s_lshl_b32 s16, s16, 16 +; GFX11-NEXT: s_and_b32 s15, s15, 0xffff ; GFX11-NEXT: s_lshl_b32 s17, s17, 16 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s16, s17 -; GFX11-NEXT: v_readlane_b32 s16, v19, 0 -; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 -; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 -; GFX11-NEXT: s_and_b32 s0, s28, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s85, 8 -; GFX11-NEXT: s_and_b32 s2, s84, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s16, 8 -; GFX11-NEXT: v_readlane_b32 s17, v19, 1 -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s2, s3 -; GFX11-NEXT: s_and_b32 s2, s29, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s83, 8 -; GFX11-NEXT: s_and_b32 s16, s82, 0xff -; GFX11-NEXT: s_lshl_b32 s17, s81, 8 -; GFX11-NEXT: v_readlane_b32 s18, v19, 2 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s16, s17 -; GFX11-NEXT: s_and_b32 s0, s0, 0xffff -; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s2, s3 -; GFX11-NEXT: s_and_b32 s2, s40, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s61, 8 -; GFX11-NEXT: s_and_b32 s16, s80, 0xff -; GFX11-NEXT: s_lshl_b32 s17, s18, 8 -; GFX11-NEXT: v_readlane_b32 s19, v19, 3 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s16, s17 -; GFX11-NEXT: s_and_b32 s16, s41, 0xff +; GFX11-NEXT: s_or_b32 s14, s14, s16 +; GFX11-NEXT: s_or_b32 s15, s15, s17 +; GFX11-NEXT: s_lshl_b32 s16, s61, 8 +; GFX11-NEXT: s_and_b32 s17, s80, 0xff +; GFX11-NEXT: s_lshl_b32 s18, s18, 8 +; GFX11-NEXT: v_readlane_b32 s19, v37, 3 +; GFX11-NEXT: s_or_b32 s12, s12, s16 +; GFX11-NEXT: s_or_b32 s16, s17, s18 ; GFX11-NEXT: s_lshl_b32 s17, s60, 8 ; GFX11-NEXT: s_and_b32 s18, s71, 0xff ; GFX11-NEXT: s_lshl_b32 s19, s70, 8 -; GFX11-NEXT: s_or_b32 s16, s16, s17 +; GFX11-NEXT: s_or_b32 s13, s13, s17 ; GFX11-NEXT: s_or_b32 s17, s18, s19 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_and_b32 s16, s16, 0xffff +; GFX11-NEXT: v_dual_mov_b32 v5, s14 :: v_dual_mov_b32 v6, s15 +; GFX11-NEXT: v_readlane_b32 s14, v37, 4 +; GFX11-NEXT: s_and_b32 s12, s12, 0xffff +; GFX11-NEXT: s_lshl_b32 s16, s16, 16 +; GFX11-NEXT: s_and_b32 s13, s13, 0xffff ; GFX11-NEXT: s_lshl_b32 s17, s17, 16 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s16, s17 -; GFX11-NEXT: v_readlane_b32 s16, v19, 4 -; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 -; GFX11-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 -; GFX11-NEXT: s_and_b32 s0, s14, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s58, 8 -; GFX11-NEXT: s_and_b32 s2, s59, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s16, 8 -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s2, s3 -; GFX11-NEXT: s_and_b32 s2, s15, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s68, 8 +; GFX11-NEXT: s_or_b32 s12, s12, s16 +; GFX11-NEXT: s_or_b32 s13, s13, s17 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v7, s12 :: v_dual_mov_b32 v8, s13 +; GFX11-NEXT: s_lshl_b32 s12, s58, 8 +; GFX11-NEXT: s_and_b32 s13, s59, 0xff +; GFX11-NEXT: s_lshl_b32 s14, s14, 8 +; GFX11-NEXT: v_readlane_b32 s15, v37, 5 +; GFX11-NEXT: s_or_b32 s10, s10, s12 +; GFX11-NEXT: s_or_b32 s12, s13, s14 +; GFX11-NEXT: s_lshl_b32 s13, s68, 8 ; GFX11-NEXT: s_and_b32 s14, s67, 0xff ; GFX11-NEXT: s_lshl_b32 s15, s66, 8 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s14, s15 -; GFX11-NEXT: v_readlane_b32 s14, v19, 6 -; GFX11-NEXT: s_and_b32 s0, s0, 0xffff -; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s2, s3 -; GFX11-NEXT: s_and_b32 s2, s12, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s65, 8 -; GFX11-NEXT: s_and_b32 s12, s64, 0xff +; GFX11-NEXT: s_or_b32 s11, s11, s13 +; GFX11-NEXT: s_or_b32 s13, s14, s15 +; GFX11-NEXT: v_readlane_b32 s14, v37, 6 +; GFX11-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-NEXT: s_lshl_b32 s12, s12, 16 +; GFX11-NEXT: s_and_b32 s11, s11, 0xffff +; GFX11-NEXT: s_lshl_b32 s13, s13, 16 +; GFX11-NEXT: s_or_b32 s10, s10, s12 +; GFX11-NEXT: s_or_b32 s11, s11, s13 +; GFX11-NEXT: s_lshl_b32 s12, s65, 8 +; GFX11-NEXT: s_and_b32 s13, s64, 0xff ; GFX11-NEXT: s_lshl_b32 s14, s14, 8 -; GFX11-NEXT: v_readlane_b32 s15, v19, 7 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s12, s14 -; GFX11-NEXT: s_and_b32 s12, s13, 0xff +; GFX11-NEXT: v_readlane_b32 s15, v37, 7 +; GFX11-NEXT: s_or_b32 s8, s8, s12 +; GFX11-NEXT: s_or_b32 s12, s13, s14 ; GFX11-NEXT: s_lshl_b32 s13, s55, 8 ; GFX11-NEXT: s_and_b32 s14, s54, 0xff ; GFX11-NEXT: s_lshl_b32 s15, s53, 8 -; GFX11-NEXT: s_or_b32 s12, s12, s13 +; GFX11-NEXT: s_or_b32 s9, s9, s13 ; GFX11-NEXT: s_or_b32 s13, s14, s15 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_and_b32 s12, s12, 0xffff -; GFX11-NEXT: s_lshl_b32 s13, s13, 16 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s12, s13 -; GFX11-NEXT: v_readlane_b32 s12, v19, 8 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:48 -; GFX11-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 -; GFX11-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 -; GFX11-NEXT: s_and_b32 s0, s10, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s52, 8 -; GFX11-NEXT: s_and_b32 s2, s51, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s12, 8 -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s2, s3 -; GFX11-NEXT: s_and_b32 s2, s11, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s50, 8 +; GFX11-NEXT: v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11 +; GFX11-NEXT: v_readlane_b32 s10, v37, 8 +; GFX11-NEXT: s_and_b32 s8, s8, 0xffff +; GFX11-NEXT: s_lshl_b32 s12, s12, 16 +; GFX11-NEXT: s_and_b32 s9, s9, 0xffff +; GFX11-NEXT: s_lshl_b32 s13, s13, 16 +; GFX11-NEXT: s_or_b32 s8, s8, s12 +; GFX11-NEXT: s_or_b32 s9, s9, s13 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v11, s8 :: v_dual_mov_b32 v12, s9 +; GFX11-NEXT: s_lshl_b32 s8, s52, 8 +; GFX11-NEXT: s_and_b32 s9, s51, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s10, 8 +; GFX11-NEXT: v_readlane_b32 s11, v37, 9 +; GFX11-NEXT: s_or_b32 s6, s6, s8 +; GFX11-NEXT: s_or_b32 s8, s9, s10 +; GFX11-NEXT: s_lshl_b32 s9, s50, 8 ; GFX11-NEXT: s_and_b32 s10, s49, 0xff ; GFX11-NEXT: s_lshl_b32 s11, s48, 8 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s10, s11 -; GFX11-NEXT: v_readlane_b32 s10, v19, 10 -; GFX11-NEXT: s_and_b32 s0, s0, 0xffff -; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s2, s3 -; GFX11-NEXT: s_and_b32 s2, s8, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s39, 8 -; GFX11-NEXT: s_and_b32 s8, s38, 0xff +; GFX11-NEXT: s_or_b32 s7, s7, s9 +; GFX11-NEXT: s_or_b32 s9, s10, s11 +; GFX11-NEXT: v_readlane_b32 s10, v37, 10 +; GFX11-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-NEXT: s_or_b32 s6, s6, s8 +; GFX11-NEXT: s_or_b32 s7, s7, s9 +; GFX11-NEXT: s_lshl_b32 s8, s39, 8 +; GFX11-NEXT: s_and_b32 s9, s38, 0xff ; GFX11-NEXT: s_lshl_b32 s10, s10, 8 -; GFX11-NEXT: v_readlane_b32 s11, v19, 11 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s8, s10 -; GFX11-NEXT: s_and_b32 s8, s9, 0xff +; GFX11-NEXT: v_readlane_b32 s11, v37, 11 +; GFX11-NEXT: s_or_b32 s4, s4, s8 +; GFX11-NEXT: s_or_b32 s8, s9, s10 ; GFX11-NEXT: s_lshl_b32 s9, s37, 8 ; GFX11-NEXT: s_and_b32 s10, s36, 0xff ; GFX11-NEXT: s_lshl_b32 s11, s35, 8 -; GFX11-NEXT: s_or_b32 s8, s8, s9 +; GFX11-NEXT: s_or_b32 s5, s5, s9 ; GFX11-NEXT: s_or_b32 s9, s10, s11 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_and_b32 s8, s8, 0xffff +; GFX11-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7 +; GFX11-NEXT: v_readlane_b32 s6, v37, 12 +; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-NEXT: s_and_b32 s5, s5, 0xffff ; GFX11-NEXT: s_lshl_b32 s9, s9, 16 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s8, s9 -; GFX11-NEXT: v_readlane_b32 s8, v19, 12 -; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 -; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 -; GFX11-NEXT: s_and_b32 s0, s6, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s56, 8 -; GFX11-NEXT: s_and_b32 s2, s57, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s8, 8 -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s2, s3 -; GFX11-NEXT: s_and_b32 s2, s7, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s34, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s8 +; GFX11-NEXT: s_or_b32 s5, s5, s9 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s5 +; GFX11-NEXT: s_lshl_b32 s4, s56, 8 +; GFX11-NEXT: s_and_b32 s5, s57, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-NEXT: v_readlane_b32 s7, v37, 13 +; GFX11-NEXT: s_or_b32 s2, s2, s4 +; GFX11-NEXT: s_or_b32 s4, s5, s6 +; GFX11-NEXT: s_lshl_b32 s5, s34, 8 ; GFX11-NEXT: s_and_b32 s6, vcc_hi, 0xff ; GFX11-NEXT: s_lshl_b32 s7, s46, 8 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s6, s7 -; GFX11-NEXT: v_readlane_b32 s6, v19, 14 -; GFX11-NEXT: s_and_b32 s0, s0, 0xffff -; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_or_b32 s3, s3, s5 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: v_readlane_b32 s6, v37, 14 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s2, s3 -; GFX11-NEXT: s_and_b32 s2, s4, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s47, 8 -; GFX11-NEXT: s_and_b32 s4, s104, 0xff +; GFX11-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_or_b32 s2, s2, s4 +; GFX11-NEXT: s_or_b32 s3, s3, s5 +; GFX11-NEXT: s_lshl_b32 s4, s47, 8 +; GFX11-NEXT: s_and_b32 s5, s104, 0xff ; GFX11-NEXT: s_lshl_b32 s6, s6, 8 -; GFX11-NEXT: v_readlane_b32 s7, v19, 15 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s4, s6 -; GFX11-NEXT: s_and_b32 s4, s5, 0xff +; GFX11-NEXT: v_readlane_b32 s7, v37, 15 +; GFX11-NEXT: s_or_b32 s0, s0, s4 +; GFX11-NEXT: s_or_b32 s4, s5, s6 ; GFX11-NEXT: s_lshl_b32 s5, s103, 8 ; GFX11-NEXT: s_and_b32 s6, s102, 0xff ; GFX11-NEXT: s_lshl_b32 s7, s101, 8 -; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s1, s1, s5 ; GFX11-NEXT: s_or_b32 s5, s6, s7 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-NEXT: s_and_b32 s1, s1, 0xffff ; GFX11-NEXT: s_lshl_b32 s5, s5, 16 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s4, s5 +; GFX11-NEXT: s_or_b32 s0, s0, s4 +; GFX11-NEXT: s_or_b32 s1, s1, s5 ; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:64 -; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 -; GFX11-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 -; GFX11-NEXT: v_readlane_b32 s17, v19, 5 -; GFX11-NEXT: v_readlane_b32 s13, v19, 9 -; GFX11-NEXT: v_readlane_b32 s9, v19, 13 +; GFX11-NEXT: v_dual_mov_b32 v5, s2 :: v_dual_mov_b32 v6, s3 +; GFX11-NEXT: v_dual_mov_b32 v7, s0 :: v_dual_mov_b32 v8, s1 ; GFX11-NEXT: s_clause 0x2 ; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:80 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:96 ; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:112 -; GFX11-NEXT: v_readlane_b32 s104, v17, 8 -; GFX11-NEXT: v_readlane_b32 s103, v17, 7 -; GFX11-NEXT: v_readlane_b32 s102, v17, 6 -; GFX11-NEXT: v_readlane_b32 s101, v17, 5 -; GFX11-NEXT: v_readlane_b32 s96, v17, 0 -; GFX11-NEXT: v_readlane_b32 s87, v16, 31 -; GFX11-NEXT: v_readlane_b32 s85, v16, 29 -; GFX11-NEXT: v_readlane_b32 s84, v16, 28 -; GFX11-NEXT: v_readlane_b32 s83, v16, 27 -; GFX11-NEXT: v_readlane_b32 s82, v16, 26 -; GFX11-NEXT: v_readlane_b32 s81, v16, 25 -; GFX11-NEXT: v_readlane_b32 s80, v16, 24 -; GFX11-NEXT: v_readlane_b32 s71, v16, 23 -; GFX11-NEXT: v_readlane_b32 s70, v16, 22 -; GFX11-NEXT: v_readlane_b32 s68, v16, 20 -; GFX11-NEXT: v_readlane_b32 s67, v16, 19 -; GFX11-NEXT: v_readlane_b32 s66, v16, 18 -; GFX11-NEXT: v_readlane_b32 s65, v16, 17 -; GFX11-NEXT: v_readlane_b32 s64, v16, 16 -; GFX11-NEXT: v_readlane_b32 s55, v16, 15 -; GFX11-NEXT: v_readlane_b32 s54, v16, 14 -; GFX11-NEXT: v_readlane_b32 s53, v16, 13 -; GFX11-NEXT: v_readlane_b32 s52, v16, 12 -; GFX11-NEXT: v_readlane_b32 s51, v16, 11 -; GFX11-NEXT: v_readlane_b32 s50, v16, 10 -; GFX11-NEXT: v_readlane_b32 s49, v16, 9 -; GFX11-NEXT: v_readlane_b32 s48, v16, 8 -; GFX11-NEXT: v_readlane_b32 s39, v16, 7 -; GFX11-NEXT: v_readlane_b32 s38, v16, 6 -; GFX11-NEXT: v_readlane_b32 s37, v16, 5 -; GFX11-NEXT: v_readlane_b32 s36, v16, 4 -; GFX11-NEXT: v_readlane_b32 s35, v16, 3 -; GFX11-NEXT: v_readlane_b32 s34, v16, 2 +; GFX11-NEXT: v_readlane_b32 s104, v35, 8 +; GFX11-NEXT: v_readlane_b32 s103, v35, 7 +; GFX11-NEXT: v_readlane_b32 s102, v35, 6 +; GFX11-NEXT: v_readlane_b32 s101, v35, 5 +; GFX11-NEXT: v_readlane_b32 s96, v35, 0 +; GFX11-NEXT: v_readlane_b32 s87, v34, 31 +; GFX11-NEXT: v_readlane_b32 s86, v34, 30 +; GFX11-NEXT: v_readlane_b32 s85, v34, 29 +; GFX11-NEXT: v_readlane_b32 s84, v34, 28 +; GFX11-NEXT: v_readlane_b32 s83, v34, 27 +; GFX11-NEXT: v_readlane_b32 s82, v34, 26 +; GFX11-NEXT: v_readlane_b32 s81, v34, 25 +; GFX11-NEXT: v_readlane_b32 s80, v34, 24 +; GFX11-NEXT: v_readlane_b32 s71, v34, 23 +; GFX11-NEXT: v_readlane_b32 s70, v34, 22 +; GFX11-NEXT: v_readlane_b32 s68, v34, 20 +; GFX11-NEXT: v_readlane_b32 s67, v34, 19 +; GFX11-NEXT: v_readlane_b32 s66, v34, 18 +; GFX11-NEXT: v_readlane_b32 s65, v34, 17 +; GFX11-NEXT: v_readlane_b32 s64, v34, 16 +; GFX11-NEXT: v_readlane_b32 s55, v34, 15 +; GFX11-NEXT: v_readlane_b32 s54, v34, 14 +; GFX11-NEXT: v_readlane_b32 s53, v34, 13 +; GFX11-NEXT: v_readlane_b32 s52, v34, 12 +; GFX11-NEXT: v_readlane_b32 s51, v34, 11 +; GFX11-NEXT: v_readlane_b32 s50, v34, 10 +; GFX11-NEXT: v_readlane_b32 s49, v34, 9 +; GFX11-NEXT: v_readlane_b32 s48, v34, 8 +; GFX11-NEXT: v_readlane_b32 s39, v34, 7 +; GFX11-NEXT: v_readlane_b32 s38, v34, 6 +; GFX11-NEXT: v_readlane_b32 s37, v34, 5 +; GFX11-NEXT: v_readlane_b32 s36, v34, 4 +; GFX11-NEXT: v_readlane_b32 s35, v34, 3 +; GFX11-NEXT: v_readlane_b32 s34, v34, 2 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload -; GFX11-NEXT: scratch_load_b32 v16, off, s32 -; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v19, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v34, off, s32 +; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:12 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -97795,47 +98071,75 @@ define inreg <64 x bfloat> @bitcast_v16i64_to_v64bf16_scalar(<16 x i64> inreg %a ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_writelane_b32 v20, s30, 0 -; SI-NEXT: v_writelane_b32 v20, s31, 1 -; SI-NEXT: v_writelane_b32 v20, s34, 2 -; SI-NEXT: v_writelane_b32 v20, s35, 3 -; SI-NEXT: v_writelane_b32 v20, s36, 4 -; SI-NEXT: v_writelane_b32 v20, s37, 5 -; SI-NEXT: v_writelane_b32 v20, s38, 6 -; SI-NEXT: v_writelane_b32 v20, s39, 7 -; SI-NEXT: v_writelane_b32 v20, s48, 8 -; SI-NEXT: v_writelane_b32 v20, s49, 9 -; SI-NEXT: v_writelane_b32 v20, s50, 10 -; SI-NEXT: v_writelane_b32 v20, s51, 11 -; SI-NEXT: v_writelane_b32 v20, s52, 12 -; SI-NEXT: v_writelane_b32 v20, s53, 13 -; SI-NEXT: v_writelane_b32 v20, s54, 14 -; SI-NEXT: v_writelane_b32 v20, s55, 15 -; SI-NEXT: v_writelane_b32 v20, s64, 16 -; SI-NEXT: v_writelane_b32 v20, s65, 17 -; SI-NEXT: v_writelane_b32 v20, s66, 18 -; SI-NEXT: v_writelane_b32 v20, s67, 19 -; SI-NEXT: v_writelane_b32 v20, s68, 20 -; SI-NEXT: v_writelane_b32 v20, s69, 21 -; SI-NEXT: v_writelane_b32 v20, s70, 22 -; SI-NEXT: v_writelane_b32 v20, s71, 23 -; SI-NEXT: v_writelane_b32 v20, s80, 24 -; SI-NEXT: v_writelane_b32 v20, s81, 25 -; SI-NEXT: v_writelane_b32 v20, s82, 26 -; SI-NEXT: v_writelane_b32 v20, s83, 27 -; SI-NEXT: v_writelane_b32 v20, s84, 28 -; SI-NEXT: v_writelane_b32 v20, s85, 29 -; SI-NEXT: v_writelane_b32 v20, s86, 30 -; SI-NEXT: v_writelane_b32 v20, s87, 31 -; SI-NEXT: v_writelane_b32 v20, s96, 32 -; SI-NEXT: v_writelane_b32 v20, s97, 33 -; SI-NEXT: v_writelane_b32 v20, s98, 34 +; SI-NEXT: v_writelane_b32 v21, s30, 0 +; SI-NEXT: v_writelane_b32 v21, s31, 1 +; SI-NEXT: v_writelane_b32 v21, s34, 2 +; SI-NEXT: v_writelane_b32 v21, s35, 3 +; SI-NEXT: v_writelane_b32 v21, s36, 4 +; SI-NEXT: v_writelane_b32 v21, s37, 5 +; SI-NEXT: v_writelane_b32 v21, s38, 6 +; SI-NEXT: v_writelane_b32 v21, s39, 7 +; SI-NEXT: v_writelane_b32 v21, s48, 8 +; SI-NEXT: v_writelane_b32 v21, s49, 9 +; SI-NEXT: v_writelane_b32 v21, s50, 10 +; SI-NEXT: v_writelane_b32 v21, s51, 11 +; SI-NEXT: v_writelane_b32 v21, s52, 12 +; SI-NEXT: v_writelane_b32 v21, s53, 13 +; SI-NEXT: v_writelane_b32 v21, s54, 14 +; SI-NEXT: v_writelane_b32 v21, s55, 15 +; SI-NEXT: v_writelane_b32 v21, s64, 16 +; SI-NEXT: v_writelane_b32 v21, s65, 17 +; SI-NEXT: v_writelane_b32 v21, s66, 18 +; SI-NEXT: v_writelane_b32 v21, s67, 19 +; SI-NEXT: v_writelane_b32 v21, s68, 20 +; SI-NEXT: v_writelane_b32 v21, s69, 21 +; SI-NEXT: v_mov_b32_e32 v20, s16 +; SI-NEXT: v_writelane_b32 v21, s70, 22 +; SI-NEXT: v_readfirstlane_b32 s48, v20 +; SI-NEXT: v_mov_b32_e32 v20, s17 +; SI-NEXT: v_writelane_b32 v21, s71, 23 +; SI-NEXT: v_readfirstlane_b32 s49, v20 +; SI-NEXT: v_mov_b32_e32 v20, s18 +; SI-NEXT: v_writelane_b32 v21, s80, 24 +; SI-NEXT: v_readfirstlane_b32 s50, v20 +; SI-NEXT: v_mov_b32_e32 v20, s19 +; SI-NEXT: v_writelane_b32 v21, s81, 25 +; SI-NEXT: v_readfirstlane_b32 s51, v20 +; SI-NEXT: v_mov_b32_e32 v20, s20 +; SI-NEXT: v_writelane_b32 v21, s82, 26 +; SI-NEXT: v_readfirstlane_b32 s52, v20 +; SI-NEXT: v_mov_b32_e32 v20, s21 +; SI-NEXT: v_writelane_b32 v21, s83, 27 +; SI-NEXT: v_readfirstlane_b32 s53, v20 +; SI-NEXT: v_mov_b32_e32 v20, s22 +; SI-NEXT: v_writelane_b32 v21, s84, 28 +; SI-NEXT: v_readfirstlane_b32 s54, v20 +; SI-NEXT: v_mov_b32_e32 v20, s23 +; SI-NEXT: v_writelane_b32 v21, s85, 29 +; SI-NEXT: v_readfirstlane_b32 s55, v20 +; SI-NEXT: v_mov_b32_e32 v20, s24 +; SI-NEXT: v_writelane_b32 v21, s86, 30 +; SI-NEXT: v_readfirstlane_b32 s64, v20 +; SI-NEXT: v_mov_b32_e32 v20, s25 +; SI-NEXT: v_writelane_b32 v21, s87, 31 +; SI-NEXT: v_readfirstlane_b32 s65, v20 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_writelane_b32 v21, s96, 32 +; SI-NEXT: v_readfirstlane_b32 s66, v20 +; SI-NEXT: v_mov_b32_e32 v20, s27 +; SI-NEXT: v_writelane_b32 v21, s97, 33 +; SI-NEXT: v_readfirstlane_b32 s67, v20 +; SI-NEXT: v_mov_b32_e32 v20, s28 +; SI-NEXT: v_writelane_b32 v21, s98, 34 +; SI-NEXT: v_readfirstlane_b32 s68, v20 +; SI-NEXT: v_mov_b32_e32 v20, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: v_writelane_b32 v20, s99, 35 +; SI-NEXT: v_writelane_b32 v21, s99, 35 +; SI-NEXT: v_readfirstlane_b32 s69, v20 ; SI-NEXT: v_readfirstlane_b32 s70, v1 ; SI-NEXT: v_readfirstlane_b32 s71, v2 ; SI-NEXT: v_readfirstlane_b32 s80, v3 @@ -97855,107 +98159,107 @@ define inreg <64 x bfloat> @bitcast_v16i64_to_v64bf16_scalar(<16 x i64> inreg %a ; SI-NEXT: v_readfirstlane_b32 s8, v17 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_readfirstlane_b32 s9, v18 -; SI-NEXT: ; implicit-def: $vgpr21 : SGPR spill to VGPR lane +; SI-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane ; SI-NEXT: s_cbranch_scc0 .LBB61_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s9, 0xffff0000 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v21, s4, 0 +; SI-NEXT: v_writelane_b32 v22, s4, 0 ; SI-NEXT: s_lshl_b32 s4, s9, 16 -; SI-NEXT: v_writelane_b32 v21, s4, 1 +; SI-NEXT: v_writelane_b32 v22, s4, 1 ; SI-NEXT: s_and_b32 s4, s8, 0xffff0000 -; SI-NEXT: v_writelane_b32 v21, s4, 2 +; SI-NEXT: v_writelane_b32 v22, s4, 2 ; SI-NEXT: s_lshl_b32 s4, s8, 16 -; SI-NEXT: v_writelane_b32 v21, s4, 3 +; SI-NEXT: v_writelane_b32 v22, s4, 3 ; SI-NEXT: s_and_b32 s11, s7, 0xffff0000 ; SI-NEXT: s_lshl_b32 s10, s7, 16 ; SI-NEXT: s_and_b32 s13, s6, 0xffff0000 ; SI-NEXT: s_lshl_b32 s12, s6, 16 ; SI-NEXT: s_and_b32 s15, s99, 0xffff0000 ; SI-NEXT: s_lshl_b32 s14, s99, 16 -; SI-NEXT: s_and_b32 s41, s98, 0xffff0000 -; SI-NEXT: s_lshl_b32 s40, s98, 16 -; SI-NEXT: s_and_b32 s43, s97, 0xffff0000 -; SI-NEXT: s_lshl_b32 s42, s97, 16 -; SI-NEXT: s_and_b32 s45, s96, 0xffff0000 -; SI-NEXT: s_lshl_b32 s44, s96, 16 -; SI-NEXT: s_and_b32 s47, s87, 0xffff0000 -; SI-NEXT: s_lshl_b32 s46, s87, 16 -; SI-NEXT: s_and_b32 s57, s86, 0xffff0000 -; SI-NEXT: s_lshl_b32 s56, s86, 16 -; SI-NEXT: s_and_b32 s59, s85, 0xffff0000 -; SI-NEXT: s_lshl_b32 s58, s85, 16 -; SI-NEXT: s_and_b32 s61, s84, 0xffff0000 -; SI-NEXT: s_lshl_b32 s60, s84, 16 -; SI-NEXT: s_and_b32 s63, s83, 0xffff0000 -; SI-NEXT: s_lshl_b32 s62, s83, 16 -; SI-NEXT: s_and_b32 s73, s82, 0xffff0000 -; SI-NEXT: s_lshl_b32 s72, s82, 16 -; SI-NEXT: s_and_b32 s75, s81, 0xffff0000 -; SI-NEXT: s_lshl_b32 s74, s81, 16 -; SI-NEXT: s_and_b32 s77, s80, 0xffff0000 -; SI-NEXT: s_lshl_b32 s76, s80, 16 -; SI-NEXT: s_and_b32 s79, s71, 0xffff0000 -; SI-NEXT: s_lshl_b32 s78, s71, 16 -; SI-NEXT: s_and_b32 s89, s70, 0xffff0000 -; SI-NEXT: s_lshl_b32 s88, s70, 16 -; SI-NEXT: s_and_b32 s91, s29, 0xffff0000 -; SI-NEXT: s_lshl_b32 s90, s29, 16 -; SI-NEXT: s_and_b32 s93, s28, 0xffff0000 -; SI-NEXT: s_lshl_b32 s92, s28, 16 -; SI-NEXT: s_and_b32 s95, s27, 0xffff0000 -; SI-NEXT: s_lshl_b32 s94, s27, 16 -; SI-NEXT: s_and_b32 s31, s26, 0xffff0000 -; SI-NEXT: s_lshl_b32 s30, s26, 16 -; SI-NEXT: s_and_b32 s35, s25, 0xffff0000 -; SI-NEXT: s_lshl_b32 s34, s25, 16 -; SI-NEXT: s_and_b32 s37, s24, 0xffff0000 -; SI-NEXT: s_lshl_b32 s36, s24, 16 -; SI-NEXT: s_and_b32 s39, s23, 0xffff0000 -; SI-NEXT: s_lshl_b32 s38, s23, 16 -; SI-NEXT: s_and_b32 s49, s22, 0xffff0000 -; SI-NEXT: s_lshl_b32 s48, s22, 16 -; SI-NEXT: s_and_b32 s51, s21, 0xffff0000 -; SI-NEXT: s_lshl_b32 s50, s21, 16 -; SI-NEXT: s_and_b32 s53, s20, 0xffff0000 -; SI-NEXT: s_lshl_b32 s52, s20, 16 -; SI-NEXT: s_and_b32 s55, s19, 0xffff0000 -; SI-NEXT: s_lshl_b32 s54, s19, 16 -; SI-NEXT: s_and_b32 s65, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s64, s18, 16 -; SI-NEXT: s_and_b32 s67, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s66, s17, 16 -; SI-NEXT: s_and_b32 s69, s16, 0xffff0000 -; SI-NEXT: s_lshl_b32 s68, s16, 16 +; SI-NEXT: s_and_b32 s17, s98, 0xffff0000 +; SI-NEXT: s_lshl_b32 s16, s98, 16 +; SI-NEXT: s_and_b32 s19, s97, 0xffff0000 +; SI-NEXT: s_lshl_b32 s18, s97, 16 +; SI-NEXT: s_and_b32 s21, s96, 0xffff0000 +; SI-NEXT: s_lshl_b32 s20, s96, 16 +; SI-NEXT: s_and_b32 s23, s87, 0xffff0000 +; SI-NEXT: s_lshl_b32 s22, s87, 16 +; SI-NEXT: s_and_b32 s25, s86, 0xffff0000 +; SI-NEXT: s_lshl_b32 s24, s86, 16 +; SI-NEXT: s_and_b32 s27, s85, 0xffff0000 +; SI-NEXT: s_lshl_b32 s26, s85, 16 +; SI-NEXT: s_and_b32 s29, s84, 0xffff0000 +; SI-NEXT: s_lshl_b32 s28, s84, 16 +; SI-NEXT: s_and_b32 s41, s83, 0xffff0000 +; SI-NEXT: s_lshl_b32 s40, s83, 16 +; SI-NEXT: s_and_b32 s43, s82, 0xffff0000 +; SI-NEXT: s_lshl_b32 s42, s82, 16 +; SI-NEXT: s_and_b32 s45, s81, 0xffff0000 +; SI-NEXT: s_lshl_b32 s44, s81, 16 +; SI-NEXT: s_and_b32 s47, s80, 0xffff0000 +; SI-NEXT: s_lshl_b32 s46, s80, 16 +; SI-NEXT: s_and_b32 s57, s71, 0xffff0000 +; SI-NEXT: s_lshl_b32 s56, s71, 16 +; SI-NEXT: s_and_b32 s59, s70, 0xffff0000 +; SI-NEXT: s_lshl_b32 s58, s70, 16 +; SI-NEXT: s_and_b32 s61, s69, 0xffff0000 +; SI-NEXT: s_lshl_b32 s60, s69, 16 +; SI-NEXT: s_and_b32 s63, s68, 0xffff0000 +; SI-NEXT: s_lshl_b32 s62, s68, 16 +; SI-NEXT: s_and_b32 s73, s67, 0xffff0000 +; SI-NEXT: s_lshl_b32 s72, s67, 16 +; SI-NEXT: s_and_b32 s75, s66, 0xffff0000 +; SI-NEXT: s_lshl_b32 s74, s66, 16 +; SI-NEXT: s_and_b32 s77, s65, 0xffff0000 +; SI-NEXT: s_lshl_b32 s76, s65, 16 +; SI-NEXT: s_and_b32 s79, s64, 0xffff0000 +; SI-NEXT: s_lshl_b32 s78, s64, 16 +; SI-NEXT: s_and_b32 s89, s55, 0xffff0000 +; SI-NEXT: s_lshl_b32 s88, s55, 16 +; SI-NEXT: s_and_b32 s91, s54, 0xffff0000 +; SI-NEXT: s_lshl_b32 s90, s54, 16 +; SI-NEXT: s_and_b32 s93, s53, 0xffff0000 +; SI-NEXT: s_lshl_b32 s92, s53, 16 +; SI-NEXT: s_and_b32 s95, s52, 0xffff0000 +; SI-NEXT: s_lshl_b32 s94, s52, 16 +; SI-NEXT: s_and_b32 s31, s51, 0xffff0000 +; SI-NEXT: s_lshl_b32 s30, s51, 16 +; SI-NEXT: s_and_b32 s35, s50, 0xffff0000 +; SI-NEXT: s_lshl_b32 s34, s50, 16 +; SI-NEXT: s_and_b32 s37, s49, 0xffff0000 +; SI-NEXT: s_lshl_b32 s36, s49, 16 +; SI-NEXT: s_and_b32 s39, s48, 0xffff0000 +; SI-NEXT: s_lshl_b32 s38, s48, 16 ; SI-NEXT: s_cbranch_execnz .LBB61_3 ; SI-NEXT: .LBB61_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s16, 3 -; SI-NEXT: s_addc_u32 s5, s17, 0 -; SI-NEXT: s_add_u32 s16, s18, 3 -; SI-NEXT: s_addc_u32 s17, s19, 0 -; SI-NEXT: s_add_u32 s18, s20, 3 -; SI-NEXT: s_addc_u32 s19, s21, 0 -; SI-NEXT: s_add_u32 s20, s22, 3 -; SI-NEXT: s_addc_u32 s21, s23, 0 -; SI-NEXT: s_add_u32 s22, s24, 3 -; SI-NEXT: s_addc_u32 s23, s25, 0 -; SI-NEXT: s_add_u32 s24, s26, 3 -; SI-NEXT: s_addc_u32 s25, s27, 0 -; SI-NEXT: s_add_u32 s26, s28, 3 -; SI-NEXT: s_addc_u32 s27, s29, 0 -; SI-NEXT: s_add_u32 s28, s70, 3 -; SI-NEXT: s_addc_u32 s29, s71, 0 -; SI-NEXT: s_add_u32 s76, s80, 3 -; SI-NEXT: s_addc_u32 s74, s81, 0 -; SI-NEXT: s_add_u32 s72, s82, 3 -; SI-NEXT: s_addc_u32 s62, s83, 0 -; SI-NEXT: s_add_u32 s60, s84, 3 -; SI-NEXT: s_addc_u32 s58, s85, 0 -; SI-NEXT: s_add_u32 s56, s86, 3 -; SI-NEXT: s_addc_u32 s46, s87, 0 -; SI-NEXT: s_add_u32 s44, s96, 3 -; SI-NEXT: s_addc_u32 s42, s97, 0 -; SI-NEXT: s_add_u32 s40, s98, 3 +; SI-NEXT: s_add_u32 s4, s48, 3 +; SI-NEXT: s_addc_u32 s5, s49, 0 +; SI-NEXT: s_add_u32 vcc_lo, s50, 3 +; SI-NEXT: s_addc_u32 vcc_hi, s51, 0 +; SI-NEXT: s_add_u32 s94, s52, 3 +; SI-NEXT: s_addc_u32 s92, s53, 0 +; SI-NEXT: s_add_u32 s90, s54, 3 +; SI-NEXT: s_addc_u32 s88, s55, 0 +; SI-NEXT: s_add_u32 s78, s64, 3 +; SI-NEXT: s_addc_u32 s76, s65, 0 +; SI-NEXT: s_add_u32 s74, s66, 3 +; SI-NEXT: s_addc_u32 s72, s67, 0 +; SI-NEXT: s_add_u32 s62, s68, 3 +; SI-NEXT: s_addc_u32 s60, s69, 0 +; SI-NEXT: s_add_u32 s58, s70, 3 +; SI-NEXT: s_addc_u32 s56, s71, 0 +; SI-NEXT: s_add_u32 s46, s80, 3 +; SI-NEXT: s_addc_u32 s44, s81, 0 +; SI-NEXT: s_add_u32 s42, s82, 3 +; SI-NEXT: s_addc_u32 s40, s83, 0 +; SI-NEXT: s_add_u32 s28, s84, 3 +; SI-NEXT: s_addc_u32 s26, s85, 0 +; SI-NEXT: s_add_u32 s24, s86, 3 +; SI-NEXT: s_addc_u32 s22, s87, 0 +; SI-NEXT: s_add_u32 s20, s96, 3 +; SI-NEXT: s_addc_u32 s18, s97, 0 +; SI-NEXT: s_add_u32 s16, s98, 3 ; SI-NEXT: s_addc_u32 s14, s99, 0 ; SI-NEXT: s_add_u32 s6, s6, 3 ; SI-NEXT: s_addc_u32 s7, s7, 0 @@ -97963,11 +98267,11 @@ define inreg <64 x bfloat> @bitcast_v16i64_to_v64bf16_scalar(<16 x i64> inreg %a ; SI-NEXT: s_addc_u32 s9, s9, 0 ; SI-NEXT: s_and_b32 s10, s9, 0xffff0000 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v21, s10, 0 +; SI-NEXT: v_writelane_b32 v22, s10, 0 ; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: v_writelane_b32 v21, s9, 1 +; SI-NEXT: v_writelane_b32 v22, s9, 1 ; SI-NEXT: s_and_b32 s9, s8, 0xffff0000 -; SI-NEXT: v_writelane_b32 v21, s9, 2 +; SI-NEXT: v_writelane_b32 v22, s9, 2 ; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_and_b32 s11, s7, 0xffff0000 ; SI-NEXT: s_lshl_b32 s10, s7, 16 @@ -97975,6 +98279,20 @@ define inreg <64 x bfloat> @bitcast_v16i64_to_v64bf16_scalar(<16 x i64> inreg %a ; SI-NEXT: s_lshl_b32 s12, s6, 16 ; SI-NEXT: s_and_b32 s15, s14, 0xffff0000 ; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_and_b32 s17, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_and_b32 s19, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s21, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_and_b32 s23, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_and_b32 s25, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s24, s24, 16 +; SI-NEXT: s_and_b32 s27, s26, 0xffff0000 +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_and_b32 s29, s28, 0xffff0000 +; SI-NEXT: s_lshl_b32 s28, s28, 16 ; SI-NEXT: s_and_b32 s41, s40, 0xffff0000 ; SI-NEXT: s_lshl_b32 s40, s40, 16 ; SI-NEXT: s_and_b32 s43, s42, 0xffff0000 @@ -97997,225 +98315,211 @@ define inreg <64 x bfloat> @bitcast_v16i64_to_v64bf16_scalar(<16 x i64> inreg %a ; SI-NEXT: s_lshl_b32 s74, s74, 16 ; SI-NEXT: s_and_b32 s77, s76, 0xffff0000 ; SI-NEXT: s_lshl_b32 s76, s76, 16 -; SI-NEXT: s_and_b32 s79, s29, 0xffff0000 -; SI-NEXT: s_lshl_b32 s78, s29, 16 -; SI-NEXT: s_and_b32 s89, s28, 0xffff0000 -; SI-NEXT: s_lshl_b32 s88, s28, 16 -; SI-NEXT: s_and_b32 s91, s27, 0xffff0000 -; SI-NEXT: s_lshl_b32 s90, s27, 16 -; SI-NEXT: s_and_b32 s93, s26, 0xffff0000 -; SI-NEXT: s_lshl_b32 s92, s26, 16 -; SI-NEXT: s_and_b32 s95, s25, 0xffff0000 -; SI-NEXT: s_lshl_b32 s94, s25, 16 -; SI-NEXT: s_and_b32 s31, s24, 0xffff0000 -; SI-NEXT: s_lshl_b32 s30, s24, 16 -; SI-NEXT: s_and_b32 s35, s23, 0xffff0000 -; SI-NEXT: s_lshl_b32 s34, s23, 16 -; SI-NEXT: s_and_b32 s37, s22, 0xffff0000 -; SI-NEXT: s_lshl_b32 s36, s22, 16 -; SI-NEXT: s_and_b32 s39, s21, 0xffff0000 -; SI-NEXT: s_lshl_b32 s38, s21, 16 -; SI-NEXT: s_and_b32 s49, s20, 0xffff0000 -; SI-NEXT: s_lshl_b32 s48, s20, 16 -; SI-NEXT: s_and_b32 s51, s19, 0xffff0000 -; SI-NEXT: s_lshl_b32 s50, s19, 16 -; SI-NEXT: s_and_b32 s53, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s52, s18, 16 -; SI-NEXT: s_and_b32 s55, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s54, s17, 16 -; SI-NEXT: s_and_b32 s65, s16, 0xffff0000 -; SI-NEXT: s_lshl_b32 s64, s16, 16 -; SI-NEXT: s_and_b32 s67, s5, 0xffff0000 -; SI-NEXT: s_lshl_b32 s66, s5, 16 -; SI-NEXT: s_and_b32 s69, s4, 0xffff0000 -; SI-NEXT: s_lshl_b32 s68, s4, 16 -; SI-NEXT: v_writelane_b32 v21, s8, 3 +; SI-NEXT: s_and_b32 s79, s78, 0xffff0000 +; SI-NEXT: s_lshl_b32 s78, s78, 16 +; SI-NEXT: s_and_b32 s89, s88, 0xffff0000 +; SI-NEXT: s_lshl_b32 s88, s88, 16 +; SI-NEXT: s_and_b32 s91, s90, 0xffff0000 +; SI-NEXT: s_lshl_b32 s90, s90, 16 +; SI-NEXT: s_and_b32 s93, s92, 0xffff0000 +; SI-NEXT: s_lshl_b32 s92, s92, 16 +; SI-NEXT: s_and_b32 s95, s94, 0xffff0000 +; SI-NEXT: s_lshl_b32 s94, s94, 16 +; SI-NEXT: s_and_b32 s31, vcc_hi, 0xffff0000 +; SI-NEXT: s_lshl_b32 s30, vcc_hi, 16 +; SI-NEXT: s_and_b32 s35, vcc_lo, 0xffff0000 +; SI-NEXT: s_lshl_b32 s34, vcc_lo, 16 +; SI-NEXT: s_and_b32 s37, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s36, s5, 16 +; SI-NEXT: s_and_b32 s39, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s38, s4, 16 +; SI-NEXT: v_writelane_b32 v22, s8, 3 ; SI-NEXT: .LBB61_3: ; %end -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s39 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s38 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_readlane_b32 s4, v21, 2 +; SI-NEXT: v_readlane_b32 s4, v22, 2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s67 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s36 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s64 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s34 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s54 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s30 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s53 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s52 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s94 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s50 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s92 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s91 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s48 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s90 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s39 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s38 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s88 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s36 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s78 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s34 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s76 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s30 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s74 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s94 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s72 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s92 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s62 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s91 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s90 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s60 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s59 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s88 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s58 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s57 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s78 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s56 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s47 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s76 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s46 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s45 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s74 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s44 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s43 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s72 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s42 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s41 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s62 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s40 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s29 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s60 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s28 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s59 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s27 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s58 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s26 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s57 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s25 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s56 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s24 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s47 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s23 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s46 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s22 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s45 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s21 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s44 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s20 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s43 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s42 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s18 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s41 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s40 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s16 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -98242,80 +98546,66 @@ define inreg <64 x bfloat> @bitcast_v16i64_to_v64bf16_scalar(<16 x i64> inreg %a ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; SI-NEXT: v_readlane_b32 s4, v21, 3 +; SI-NEXT: v_readlane_b32 s4, v22, 3 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: v_readlane_b32 s4, v21, 0 +; SI-NEXT: v_readlane_b32 s4, v22, 0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; SI-NEXT: v_readlane_b32 s4, v21, 1 +; SI-NEXT: v_readlane_b32 s4, v22, 1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: v_readlane_b32 s99, v20, 35 -; SI-NEXT: v_readlane_b32 s98, v20, 34 -; SI-NEXT: v_readlane_b32 s97, v20, 33 -; SI-NEXT: v_readlane_b32 s96, v20, 32 -; SI-NEXT: v_readlane_b32 s87, v20, 31 -; SI-NEXT: v_readlane_b32 s86, v20, 30 -; SI-NEXT: v_readlane_b32 s85, v20, 29 -; SI-NEXT: v_readlane_b32 s84, v20, 28 -; SI-NEXT: v_readlane_b32 s83, v20, 27 -; SI-NEXT: v_readlane_b32 s82, v20, 26 -; SI-NEXT: v_readlane_b32 s81, v20, 25 -; SI-NEXT: v_readlane_b32 s80, v20, 24 -; SI-NEXT: v_readlane_b32 s71, v20, 23 -; SI-NEXT: v_readlane_b32 s70, v20, 22 -; SI-NEXT: v_readlane_b32 s69, v20, 21 -; SI-NEXT: v_readlane_b32 s68, v20, 20 -; SI-NEXT: v_readlane_b32 s67, v20, 19 -; SI-NEXT: v_readlane_b32 s66, v20, 18 -; SI-NEXT: v_readlane_b32 s65, v20, 17 -; SI-NEXT: v_readlane_b32 s64, v20, 16 -; SI-NEXT: v_readlane_b32 s55, v20, 15 -; SI-NEXT: v_readlane_b32 s54, v20, 14 -; SI-NEXT: v_readlane_b32 s53, v20, 13 -; SI-NEXT: v_readlane_b32 s52, v20, 12 -; SI-NEXT: v_readlane_b32 s51, v20, 11 -; SI-NEXT: v_readlane_b32 s50, v20, 10 -; SI-NEXT: v_readlane_b32 s49, v20, 9 -; SI-NEXT: v_readlane_b32 s48, v20, 8 -; SI-NEXT: v_readlane_b32 s39, v20, 7 -; SI-NEXT: v_readlane_b32 s38, v20, 6 -; SI-NEXT: v_readlane_b32 s37, v20, 5 -; SI-NEXT: v_readlane_b32 s36, v20, 4 -; SI-NEXT: v_readlane_b32 s35, v20, 3 -; SI-NEXT: v_readlane_b32 s34, v20, 2 -; SI-NEXT: v_readlane_b32 s31, v20, 1 -; SI-NEXT: v_readlane_b32 s30, v20, 0 +; SI-NEXT: v_readlane_b32 s99, v21, 35 +; SI-NEXT: v_readlane_b32 s98, v21, 34 +; SI-NEXT: v_readlane_b32 s97, v21, 33 +; SI-NEXT: v_readlane_b32 s96, v21, 32 +; SI-NEXT: v_readlane_b32 s87, v21, 31 +; SI-NEXT: v_readlane_b32 s86, v21, 30 +; SI-NEXT: v_readlane_b32 s85, v21, 29 +; SI-NEXT: v_readlane_b32 s84, v21, 28 +; SI-NEXT: v_readlane_b32 s83, v21, 27 +; SI-NEXT: v_readlane_b32 s82, v21, 26 +; SI-NEXT: v_readlane_b32 s81, v21, 25 +; SI-NEXT: v_readlane_b32 s80, v21, 24 +; SI-NEXT: v_readlane_b32 s71, v21, 23 +; SI-NEXT: v_readlane_b32 s70, v21, 22 +; SI-NEXT: v_readlane_b32 s69, v21, 21 +; SI-NEXT: v_readlane_b32 s68, v21, 20 +; SI-NEXT: v_readlane_b32 s67, v21, 19 +; SI-NEXT: v_readlane_b32 s66, v21, 18 +; SI-NEXT: v_readlane_b32 s65, v21, 17 +; SI-NEXT: v_readlane_b32 s64, v21, 16 +; SI-NEXT: v_readlane_b32 s55, v21, 15 +; SI-NEXT: v_readlane_b32 s54, v21, 14 +; SI-NEXT: v_readlane_b32 s53, v21, 13 +; SI-NEXT: v_readlane_b32 s52, v21, 12 +; SI-NEXT: v_readlane_b32 s51, v21, 11 +; SI-NEXT: v_readlane_b32 s50, v21, 10 +; SI-NEXT: v_readlane_b32 s49, v21, 9 +; SI-NEXT: v_readlane_b32 s48, v21, 8 +; SI-NEXT: v_readlane_b32 s39, v21, 7 +; SI-NEXT: v_readlane_b32 s38, v21, 6 +; SI-NEXT: v_readlane_b32 s37, v21, 5 +; SI-NEXT: v_readlane_b32 s36, v21, 4 +; SI-NEXT: v_readlane_b32 s35, v21, 3 +; SI-NEXT: v_readlane_b32 s34, v21, 2 +; SI-NEXT: v_readlane_b32 s31, v21, 1 +; SI-NEXT: v_readlane_b32 s30, v21, 0 ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB61_4: ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr68 -; SI-NEXT: ; implicit-def: $sgpr69 -; SI-NEXT: ; implicit-def: $sgpr66 -; SI-NEXT: ; implicit-def: $sgpr67 -; SI-NEXT: ; implicit-def: $sgpr64 -; SI-NEXT: ; implicit-def: $sgpr65 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $sgpr55 -; SI-NEXT: ; implicit-def: $sgpr52 -; SI-NEXT: ; implicit-def: $sgpr53 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $sgpr51 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr49 ; SI-NEXT: ; implicit-def: $sgpr38 ; SI-NEXT: ; implicit-def: $sgpr39 ; SI-NEXT: ; implicit-def: $sgpr36 @@ -98356,6 +98646,20 @@ define inreg <64 x bfloat> @bitcast_v16i64_to_v64bf16_scalar(<16 x i64> inreg %a ; SI-NEXT: ; implicit-def: $sgpr43 ; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr21 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; implicit-def: $sgpr17 ; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: ; implicit-def: $sgpr15 ; SI-NEXT: ; implicit-def: $sgpr12 @@ -106494,20 +106798,48 @@ define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v20, s16 +; SI-NEXT: v_readfirstlane_b32 s40, v20 +; SI-NEXT: v_mov_b32_e32 v20, s17 +; SI-NEXT: v_readfirstlane_b32 s44, v20 +; SI-NEXT: v_mov_b32_e32 v20, s18 +; SI-NEXT: v_readfirstlane_b32 s41, v20 +; SI-NEXT: v_mov_b32_e32 v20, s19 +; SI-NEXT: v_readfirstlane_b32 s45, v20 +; SI-NEXT: v_mov_b32_e32 v20, s20 +; SI-NEXT: v_readfirstlane_b32 s42, v20 +; SI-NEXT: v_mov_b32_e32 v20, s21 +; SI-NEXT: v_readfirstlane_b32 s46, v20 +; SI-NEXT: v_mov_b32_e32 v20, s22 +; SI-NEXT: v_readfirstlane_b32 s43, v20 +; SI-NEXT: v_mov_b32_e32 v20, s23 +; SI-NEXT: v_readfirstlane_b32 s47, v20 +; SI-NEXT: v_mov_b32_e32 v20, s24 +; SI-NEXT: v_readfirstlane_b32 s24, v20 +; SI-NEXT: v_mov_b32_e32 v20, s25 +; SI-NEXT: v_readfirstlane_b32 s56, v20 +; SI-NEXT: v_mov_b32_e32 v20, s26 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v63, s30, 0 +; SI-NEXT: v_readfirstlane_b32 s25, v20 +; SI-NEXT: v_mov_b32_e32 v20, s27 ; SI-NEXT: v_writelane_b32 v63, s31, 1 +; SI-NEXT: v_readfirstlane_b32 s27, v20 +; SI-NEXT: v_mov_b32_e32 v20, s28 ; SI-NEXT: v_writelane_b32 v63, s34, 2 +; SI-NEXT: v_readfirstlane_b32 s26, v20 +; SI-NEXT: v_mov_b32_e32 v20, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 ; SI-NEXT: v_writelane_b32 v63, s35, 3 -; SI-NEXT: v_readfirstlane_b32 s46, v1 -; SI-NEXT: v_readfirstlane_b32 s47, v2 -; SI-NEXT: v_readfirstlane_b32 s44, v3 -; SI-NEXT: v_readfirstlane_b32 s45, v4 -; SI-NEXT: v_readfirstlane_b32 s42, v5 -; SI-NEXT: v_readfirstlane_b32 s43, v6 -; SI-NEXT: v_readfirstlane_b32 s40, v7 -; SI-NEXT: v_readfirstlane_b32 s41, v8 +; SI-NEXT: v_readfirstlane_b32 s28, v20 +; SI-NEXT: v_readfirstlane_b32 s22, v1 +; SI-NEXT: v_readfirstlane_b32 s23, v2 +; SI-NEXT: v_readfirstlane_b32 s20, v3 +; SI-NEXT: v_readfirstlane_b32 s21, v4 +; SI-NEXT: v_readfirstlane_b32 s18, v5 +; SI-NEXT: v_readfirstlane_b32 s19, v6 +; SI-NEXT: v_readfirstlane_b32 s16, v7 +; SI-NEXT: v_readfirstlane_b32 s17, v8 ; SI-NEXT: v_readfirstlane_b32 s14, v9 ; SI-NEXT: v_readfirstlane_b32 s15, v10 ; SI-NEXT: v_readfirstlane_b32 s12, v11 @@ -106560,49 +106892,49 @@ define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 ; SI-NEXT: s_lshr_b32 s4, s14, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s41, 16 +; SI-NEXT: s_lshr_b32 s4, s17, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s40, 16 +; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s43, 16 +; SI-NEXT: s_lshr_b32 s4, s19, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_lshr_b32 s4, s42, 16 +; SI-NEXT: s_lshr_b32 s4, s18, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 -; SI-NEXT: s_lshr_b32 s4, s45, 16 +; SI-NEXT: s_lshr_b32 s4, s21, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 -; SI-NEXT: s_lshr_b32 s4, s44, 16 +; SI-NEXT: s_lshr_b32 s4, s20, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 -; SI-NEXT: s_lshr_b32 s4, s47, 16 +; SI-NEXT: s_lshr_b32 s4, s23, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 -; SI-NEXT: s_lshr_b32 s4, s46, 16 +; SI-NEXT: s_lshr_b32 s4, s22, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 ; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 ; SI-NEXT: s_lshr_b32 s4, s27, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 ; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: s_lshr_b32 s4, s56, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 ; SI-NEXT: s_lshr_b32 s4, s24, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: s_lshr_b32 s4, s47, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: s_lshr_b32 s4, s43, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: s_lshr_b32 s4, s46, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v45, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: s_lshr_b32 s4, s42, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: s_lshr_b32 s4, s45, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v57, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: s_lshr_b32 s4, s41, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: s_lshr_b32 s4, s44, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_lshr_b32 s4, s40, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f32_f16_e32 v61, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 @@ -106613,75 +106945,75 @@ define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v18, s12 ; SI-NEXT: v_cvt_f32_f16_e32 v20, s15 ; SI-NEXT: v_cvt_f32_f16_e32 v22, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s26 ; SI-NEXT: v_cvt_f32_f16_e32 v50, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s56 ; SI-NEXT: v_cvt_f32_f16_e32 v40, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v62, s18 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v62, s41 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s40 ; SI-NEXT: s_cbranch_execnz .LBB65_3 ; SI-NEXT: .LBB65_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s16, 3 -; SI-NEXT: s_addc_u32 s5, s17, 0 -; SI-NEXT: s_lshr_b32 s16, s4, 16 -; SI-NEXT: s_lshr_b32 s17, s5, 16 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: s_lshr_b32 s56, s18, 16 -; SI-NEXT: s_lshr_b32 s57, s19, 16 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_lshr_b32 s58, s20, 16 -; SI-NEXT: s_lshr_b32 s59, s21, 16 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_lshr_b32 s60, s22, 16 -; SI-NEXT: s_lshr_b32 s61, s23, 16 +; SI-NEXT: s_add_u32 s4, s40, 3 +; SI-NEXT: s_addc_u32 s5, s44, 0 +; SI-NEXT: s_lshr_b32 s29, s4, 16 +; SI-NEXT: s_lshr_b32 s40, s5, 16 +; SI-NEXT: s_add_u32 s41, s41, 3 +; SI-NEXT: s_addc_u32 s44, s45, 0 +; SI-NEXT: s_lshr_b32 s45, s41, 16 +; SI-NEXT: s_lshr_b32 s57, s44, 16 +; SI-NEXT: s_add_u32 s42, s42, 3 +; SI-NEXT: s_addc_u32 s46, s46, 0 +; SI-NEXT: s_lshr_b32 s58, s42, 16 +; SI-NEXT: s_lshr_b32 s59, s46, 16 +; SI-NEXT: s_add_u32 s43, s43, 3 +; SI-NEXT: s_addc_u32 s47, s47, 0 +; SI-NEXT: s_lshr_b32 s60, s43, 16 +; SI-NEXT: s_lshr_b32 s61, s47, 16 ; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_addc_u32 s56, s56, 0 ; SI-NEXT: s_lshr_b32 s62, s24, 16 -; SI-NEXT: s_lshr_b32 s63, s25, 16 -; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_lshr_b32 s63, s56, 16 +; SI-NEXT: s_add_u32 s25, s25, 3 ; SI-NEXT: s_addc_u32 s27, s27, 0 -; SI-NEXT: s_lshr_b32 s72, s26, 16 +; SI-NEXT: s_lshr_b32 s72, s25, 16 ; SI-NEXT: s_lshr_b32 s73, s27, 16 -; SI-NEXT: s_add_u32 s28, s28, 3 -; SI-NEXT: s_addc_u32 s29, s29, 0 -; SI-NEXT: s_lshr_b32 s74, s28, 16 -; SI-NEXT: s_lshr_b32 s75, s29, 16 -; SI-NEXT: s_add_u32 s46, s46, 3 -; SI-NEXT: s_addc_u32 s47, s47, 0 -; SI-NEXT: s_lshr_b32 s76, s46, 16 -; SI-NEXT: s_lshr_b32 s77, s47, 16 -; SI-NEXT: s_add_u32 s44, s44, 3 -; SI-NEXT: s_addc_u32 s45, s45, 0 -; SI-NEXT: s_lshr_b32 s78, s44, 16 -; SI-NEXT: s_lshr_b32 s79, s45, 16 -; SI-NEXT: s_add_u32 s42, s42, 3 -; SI-NEXT: s_addc_u32 s43, s43, 0 -; SI-NEXT: s_lshr_b32 s88, s42, 16 -; SI-NEXT: s_lshr_b32 s89, s43, 16 -; SI-NEXT: s_add_u32 s40, s40, 3 -; SI-NEXT: s_addc_u32 s41, s41, 0 -; SI-NEXT: s_lshr_b32 s90, s40, 16 -; SI-NEXT: s_lshr_b32 s91, s41, 16 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s28, s28, 0 +; SI-NEXT: s_lshr_b32 s74, s26, 16 +; SI-NEXT: s_lshr_b32 s75, s28, 16 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_lshr_b32 s76, s22, 16 +; SI-NEXT: s_lshr_b32 s77, s23, 16 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_lshr_b32 s78, s20, 16 +; SI-NEXT: s_lshr_b32 s79, s21, 16 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_lshr_b32 s88, s18, 16 +; SI-NEXT: s_lshr_b32 s89, s19, 16 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_lshr_b32 s90, s16, 16 +; SI-NEXT: s_lshr_b32 s91, s17, 16 ; SI-NEXT: s_add_u32 s14, s14, 3 ; SI-NEXT: s_addc_u32 s15, s15, 0 ; SI-NEXT: s_lshr_b32 s92, s14, 16 @@ -106716,26 +107048,26 @@ define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v18, s12 ; SI-NEXT: v_cvt_f32_f16_e32 v20, s15 ; SI-NEXT: v_cvt_f32_f16_e32 v22, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s26 ; SI-NEXT: v_cvt_f32_f16_e32 v50, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s56 ; SI-NEXT: v_cvt_f32_f16_e32 v40, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v62, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v62, s41 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -106767,9 +107099,9 @@ define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v45, s59 ; SI-NEXT: v_cvt_f32_f16_e32 v47, s58 ; SI-NEXT: v_cvt_f32_f16_e32 v57, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v61, s29 ; SI-NEXT: .LBB65_3: ; %end ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 @@ -110066,37 +110398,65 @@ define inreg <64 x i16> @bitcast_v16i64_to_v64i16_scalar(<16 x i64> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v20, s30, 0 -; SI-NEXT: v_writelane_b32 v20, s31, 1 -; SI-NEXT: v_writelane_b32 v20, s34, 2 -; SI-NEXT: v_writelane_b32 v20, s35, 3 -; SI-NEXT: v_writelane_b32 v20, s36, 4 -; SI-NEXT: v_writelane_b32 v20, s37, 5 -; SI-NEXT: v_writelane_b32 v20, s38, 6 -; SI-NEXT: v_writelane_b32 v20, s39, 7 -; SI-NEXT: v_writelane_b32 v20, s48, 8 -; SI-NEXT: v_writelane_b32 v20, s49, 9 -; SI-NEXT: v_writelane_b32 v20, s50, 10 -; SI-NEXT: v_writelane_b32 v20, s51, 11 -; SI-NEXT: v_writelane_b32 v20, s52, 12 -; SI-NEXT: v_writelane_b32 v20, s53, 13 -; SI-NEXT: v_writelane_b32 v20, s54, 14 -; SI-NEXT: v_writelane_b32 v20, s55, 15 -; SI-NEXT: v_writelane_b32 v20, s64, 16 -; SI-NEXT: v_writelane_b32 v20, s65, 17 -; SI-NEXT: v_writelane_b32 v20, s66, 18 -; SI-NEXT: v_writelane_b32 v20, s67, 19 +; SI-NEXT: v_writelane_b32 v21, s30, 0 +; SI-NEXT: v_writelane_b32 v21, s31, 1 +; SI-NEXT: v_writelane_b32 v21, s34, 2 +; SI-NEXT: v_writelane_b32 v21, s35, 3 +; SI-NEXT: v_writelane_b32 v21, s36, 4 +; SI-NEXT: v_writelane_b32 v21, s37, 5 +; SI-NEXT: v_writelane_b32 v21, s38, 6 +; SI-NEXT: v_mov_b32_e32 v20, s16 +; SI-NEXT: v_writelane_b32 v21, s39, 7 +; SI-NEXT: v_readfirstlane_b32 s56, v20 +; SI-NEXT: v_mov_b32_e32 v20, s17 +; SI-NEXT: v_writelane_b32 v21, s48, 8 +; SI-NEXT: v_readfirstlane_b32 s57, v20 +; SI-NEXT: v_mov_b32_e32 v20, s18 +; SI-NEXT: v_writelane_b32 v21, s49, 9 +; SI-NEXT: v_readfirstlane_b32 s46, v20 +; SI-NEXT: v_mov_b32_e32 v20, s19 +; SI-NEXT: v_writelane_b32 v21, s50, 10 +; SI-NEXT: v_readfirstlane_b32 s47, v20 +; SI-NEXT: v_mov_b32_e32 v20, s20 +; SI-NEXT: v_writelane_b32 v21, s51, 11 +; SI-NEXT: v_readfirstlane_b32 s44, v20 +; SI-NEXT: v_mov_b32_e32 v20, s21 +; SI-NEXT: v_writelane_b32 v21, s52, 12 +; SI-NEXT: v_readfirstlane_b32 s45, v20 +; SI-NEXT: v_mov_b32_e32 v20, s22 +; SI-NEXT: v_writelane_b32 v21, s53, 13 +; SI-NEXT: v_readfirstlane_b32 s42, v20 +; SI-NEXT: v_mov_b32_e32 v20, s23 +; SI-NEXT: v_writelane_b32 v21, s54, 14 +; SI-NEXT: v_readfirstlane_b32 s43, v20 +; SI-NEXT: v_mov_b32_e32 v20, s24 +; SI-NEXT: v_writelane_b32 v21, s55, 15 +; SI-NEXT: v_readfirstlane_b32 s40, v20 +; SI-NEXT: v_mov_b32_e32 v20, s25 +; SI-NEXT: v_writelane_b32 v21, s64, 16 +; SI-NEXT: v_readfirstlane_b32 s41, v20 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_writelane_b32 v21, s65, 17 +; SI-NEXT: v_readfirstlane_b32 s24, v20 +; SI-NEXT: v_mov_b32_e32 v20, s27 +; SI-NEXT: v_writelane_b32 v21, s66, 18 +; SI-NEXT: v_readfirstlane_b32 s25, v20 +; SI-NEXT: v_mov_b32_e32 v20, s28 +; SI-NEXT: v_writelane_b32 v21, s67, 19 +; SI-NEXT: v_readfirstlane_b32 s22, v20 +; SI-NEXT: v_mov_b32_e32 v20, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: v_writelane_b32 v20, s68, 20 -; SI-NEXT: v_readfirstlane_b32 s44, v1 -; SI-NEXT: v_readfirstlane_b32 s45, v2 -; SI-NEXT: v_readfirstlane_b32 s42, v3 -; SI-NEXT: v_readfirstlane_b32 s43, v4 -; SI-NEXT: v_readfirstlane_b32 s40, v5 -; SI-NEXT: v_readfirstlane_b32 s41, v6 +; SI-NEXT: v_writelane_b32 v21, s68, 20 +; SI-NEXT: v_readfirstlane_b32 s23, v20 +; SI-NEXT: v_readfirstlane_b32 s20, v1 +; SI-NEXT: v_readfirstlane_b32 s21, v2 +; SI-NEXT: v_readfirstlane_b32 s18, v3 +; SI-NEXT: v_readfirstlane_b32 s19, v4 +; SI-NEXT: v_readfirstlane_b32 s16, v5 +; SI-NEXT: v_readfirstlane_b32 s17, v6 ; SI-NEXT: v_readfirstlane_b32 s14, v7 ; SI-NEXT: v_readfirstlane_b32 s15, v8 ; SI-NEXT: v_readfirstlane_b32 s12, v9 @@ -110108,9 +110468,9 @@ define inreg <64 x i16> @bitcast_v16i64_to_v64i16_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: v_readfirstlane_b32 s6, v15 ; SI-NEXT: v_readfirstlane_b32 s7, v16 ; SI-NEXT: v_readfirstlane_b32 s4, v17 -; SI-NEXT: s_and_b64 s[46:47], vcc, exec +; SI-NEXT: s_and_b64 s[26:27], vcc, exec ; SI-NEXT: v_readfirstlane_b32 s5, v18 -; SI-NEXT: v_writelane_b32 v20, s69, 21 +; SI-NEXT: v_writelane_b32 v21, s69, 21 ; SI-NEXT: s_cbranch_scc0 .LBB69_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s38, s5, 16 @@ -110119,32 +110479,32 @@ define inreg <64 x i16> @bitcast_v16i64_to_v64i16_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: s_lshr_b32 s49, s11, 16 ; SI-NEXT: s_lshr_b32 s50, s13, 16 ; SI-NEXT: s_lshr_b32 s51, s15, 16 -; SI-NEXT: s_lshr_b32 s52, s41, 16 -; SI-NEXT: s_lshr_b32 s53, s43, 16 -; SI-NEXT: s_lshr_b32 s54, s45, 16 -; SI-NEXT: s_lshr_b32 s55, s29, 16 -; SI-NEXT: s_lshr_b32 s64, s27, 16 -; SI-NEXT: s_lshr_b32 s65, s25, 16 -; SI-NEXT: s_lshr_b32 s66, s23, 16 -; SI-NEXT: s_lshr_b32 s67, s21, 16 -; SI-NEXT: s_lshr_b32 s68, s19, 16 -; SI-NEXT: s_lshr_b32 s69, s17, 16 -; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[56:57], s[6:7], 16 +; SI-NEXT: s_lshr_b32 s52, s17, 16 +; SI-NEXT: s_lshr_b32 s53, s19, 16 +; SI-NEXT: s_lshr_b32 s54, s21, 16 +; SI-NEXT: s_lshr_b32 s55, s23, 16 +; SI-NEXT: s_lshr_b32 s64, s25, 16 +; SI-NEXT: s_lshr_b32 s65, s41, 16 +; SI-NEXT: s_lshr_b32 s66, s43, 16 +; SI-NEXT: s_lshr_b32 s67, s45, 16 +; SI-NEXT: s_lshr_b32 s68, s47, 16 +; SI-NEXT: s_lshr_b32 s69, s57, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 ; SI-NEXT: s_lshr_b64 s[58:59], s[8:9], 16 ; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 16 ; SI-NEXT: s_lshr_b64 s[62:63], s[12:13], 16 ; SI-NEXT: s_lshr_b64 s[72:73], s[14:15], 16 -; SI-NEXT: s_lshr_b64 s[74:75], s[40:41], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[42:43], 16 -; SI-NEXT: s_lshr_b64 s[78:79], s[44:45], 16 -; SI-NEXT: s_lshr_b64 s[88:89], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[90:91], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[92:93], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[94:95], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[30:31], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[34:35], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[46:47], 16 +; SI-NEXT: s_lshr_b64 s[36:37], s[56:57], 16 ; SI-NEXT: s_cbranch_execnz .LBB69_3 ; SI-NEXT: .LBB69_2: ; %cmp.true ; SI-NEXT: s_add_u32 s4, s4, 3 @@ -110159,186 +110519,186 @@ define inreg <64 x i16> @bitcast_v16i64_to_v64i16_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: s_addc_u32 s13, s13, 0 ; SI-NEXT: s_add_u32 s14, s14, 3 ; SI-NEXT: s_addc_u32 s15, s15, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 ; SI-NEXT: s_add_u32 s40, s40, 3 ; SI-NEXT: s_addc_u32 s41, s41, 0 ; SI-NEXT: s_add_u32 s42, s42, 3 ; SI-NEXT: s_addc_u32 s43, s43, 0 ; SI-NEXT: s_add_u32 s44, s44, 3 ; SI-NEXT: s_addc_u32 s45, s45, 0 -; SI-NEXT: s_add_u32 s28, s28, 3 -; SI-NEXT: s_addc_u32 s29, s29, 0 -; SI-NEXT: s_add_u32 s26, s26, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 -; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s25, s25, 0 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: s_add_u32 s16, s16, 3 -; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s46, s46, 3 +; SI-NEXT: s_addc_u32 s47, s47, 0 +; SI-NEXT: s_add_u32 s56, s56, 3 +; SI-NEXT: s_addc_u32 s57, s57, 0 ; SI-NEXT: s_lshr_b32 s38, s5, 16 ; SI-NEXT: s_lshr_b32 s39, s7, 16 ; SI-NEXT: s_lshr_b32 s48, s9, 16 ; SI-NEXT: s_lshr_b32 s49, s11, 16 ; SI-NEXT: s_lshr_b32 s50, s13, 16 ; SI-NEXT: s_lshr_b32 s51, s15, 16 -; SI-NEXT: s_lshr_b32 s52, s41, 16 -; SI-NEXT: s_lshr_b32 s53, s43, 16 -; SI-NEXT: s_lshr_b32 s54, s45, 16 -; SI-NEXT: s_lshr_b32 s55, s29, 16 -; SI-NEXT: s_lshr_b32 s64, s27, 16 -; SI-NEXT: s_lshr_b32 s65, s25, 16 -; SI-NEXT: s_lshr_b32 s66, s23, 16 -; SI-NEXT: s_lshr_b32 s67, s21, 16 -; SI-NEXT: s_lshr_b32 s68, s19, 16 -; SI-NEXT: s_lshr_b32 s69, s17, 16 -; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[56:57], s[6:7], 16 +; SI-NEXT: s_lshr_b32 s52, s17, 16 +; SI-NEXT: s_lshr_b32 s53, s19, 16 +; SI-NEXT: s_lshr_b32 s54, s21, 16 +; SI-NEXT: s_lshr_b32 s55, s23, 16 +; SI-NEXT: s_lshr_b32 s64, s25, 16 +; SI-NEXT: s_lshr_b32 s65, s41, 16 +; SI-NEXT: s_lshr_b32 s66, s43, 16 +; SI-NEXT: s_lshr_b32 s67, s45, 16 +; SI-NEXT: s_lshr_b32 s68, s47, 16 +; SI-NEXT: s_lshr_b32 s69, s57, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 ; SI-NEXT: s_lshr_b64 s[58:59], s[8:9], 16 ; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 16 ; SI-NEXT: s_lshr_b64 s[62:63], s[12:13], 16 ; SI-NEXT: s_lshr_b64 s[72:73], s[14:15], 16 -; SI-NEXT: s_lshr_b64 s[74:75], s[40:41], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[42:43], 16 -; SI-NEXT: s_lshr_b64 s[78:79], s[44:45], 16 -; SI-NEXT: s_lshr_b64 s[88:89], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[90:91], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[92:93], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[94:95], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[30:31], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[34:35], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[46:47], 16 +; SI-NEXT: s_lshr_b64 s[36:37], s[56:57], 16 ; SI-NEXT: .LBB69_3: ; %end -; SI-NEXT: s_lshl_b32 s47, s36, 16 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s16, s16, s47 -; SI-NEXT: v_mov_b32_e32 v1, s16 -; SI-NEXT: s_and_b32 s16, s17, 0xffff -; SI-NEXT: s_lshl_b32 s17, s69, 16 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_lshl_b32 s16, s34, 16 -; SI-NEXT: s_and_b32 s17, s18, 0xffff -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_mov_b32_e32 v3, s16 -; SI-NEXT: s_and_b32 s16, s19, 0xffff -; SI-NEXT: s_lshl_b32 s17, s68, 16 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_mov_b32_e32 v4, s16 -; SI-NEXT: s_lshl_b32 s16, s30, 16 -; SI-NEXT: s_and_b32 s17, s20, 0xffff -; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_lshl_b32 s27, s36, 16 +; SI-NEXT: s_and_b32 s29, s56, 0xffff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: v_mov_b32_e32 v1, s27 +; SI-NEXT: s_and_b32 s27, s57, 0xffff +; SI-NEXT: s_lshl_b32 s29, s69, 16 +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_lshl_b32 s27, s34, 16 +; SI-NEXT: s_and_b32 s29, s46, 0xffff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: v_mov_b32_e32 v3, s27 +; SI-NEXT: s_and_b32 s27, s47, 0xffff +; SI-NEXT: s_lshl_b32 s29, s68, 16 +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_mov_b32_e32 v4, s27 +; SI-NEXT: s_lshl_b32 s27, s30, 16 +; SI-NEXT: s_and_b32 s29, s44, 0xffff +; SI-NEXT: s_or_b32 s27, s29, s27 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_mov_b32_e32 v5, s16 +; SI-NEXT: v_mov_b32_e32 v5, s27 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: s_and_b32 s16, s21, 0xffff -; SI-NEXT: s_lshl_b32 s17, s67, 16 +; SI-NEXT: s_and_b32 s27, s45, 0xffff +; SI-NEXT: s_lshl_b32 s29, s67, 16 ; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s22, 0xffff -; SI-NEXT: s_lshl_b32 s17, s94, 16 +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_and_b32 s27, s42, 0xffff +; SI-NEXT: s_lshl_b32 s29, s94, 16 ; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s23, 0xffff -; SI-NEXT: s_lshl_b32 s17, s66, 16 +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_and_b32 s27, s43, 0xffff +; SI-NEXT: s_lshl_b32 s29, s66, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s24, 0xffff -; SI-NEXT: s_lshl_b32 s17, s92, 16 +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_and_b32 s27, s40, 0xffff +; SI-NEXT: s_lshl_b32 s29, s92, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s25, 0xffff -; SI-NEXT: s_lshl_b32 s17, s65, 16 +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_and_b32 s27, s41, 0xffff +; SI-NEXT: s_lshl_b32 s29, s65, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s26, 0xffff -; SI-NEXT: s_lshl_b32 s17, s90, 16 +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_lshl_b32 s27, s90, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s24, s24, s27 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s27, 0xffff -; SI-NEXT: s_lshl_b32 s17, s64, 16 +; SI-NEXT: v_mov_b32_e32 v2, s24 +; SI-NEXT: s_and_b32 s24, s25, 0xffff +; SI-NEXT: s_lshl_b32 s25, s64, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s24, s24, s25 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s28, 0xffff -; SI-NEXT: s_lshl_b32 s17, s88, 16 +; SI-NEXT: v_mov_b32_e32 v2, s24 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_lshl_b32 s24, s88, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s22, s22, s24 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s29, 0xffff -; SI-NEXT: s_lshl_b32 s17, s55, 16 +; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: s_and_b32 s22, s23, 0xffff +; SI-NEXT: s_lshl_b32 s23, s55, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s22, s22, s23 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s44, 0xffff -; SI-NEXT: s_lshl_b32 s17, s78, 16 +; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_lshl_b32 s22, s78, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s20, s20, s22 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s45, 0xffff -; SI-NEXT: s_lshl_b32 s17, s54, 16 +; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_and_b32 s20, s21, 0xffff +; SI-NEXT: s_lshl_b32 s21, s54, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s20, s20, s21 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s42, 0xffff -; SI-NEXT: s_lshl_b32 s17, s76, 16 +; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s20, s76, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s18, s18, s20 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s43, 0xffff -; SI-NEXT: s_lshl_b32 s17, s53, 16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_and_b32 s18, s19, 0xffff +; SI-NEXT: s_lshl_b32 s19, s53, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s40, 0xffff -; SI-NEXT: s_lshl_b32 s17, s74, 16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s18, s74, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s16, s16, s18 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s41, 0xffff +; SI-NEXT: s_and_b32 s16, s17, 0xffff ; SI-NEXT: s_lshl_b32 s17, s52, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v0 ; SI-NEXT: s_or_b32 s16, s16, s17 @@ -110402,7 +110762,7 @@ define inreg <64 x i16> @bitcast_v16i64_to_v64i16_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_lshl_b32 s8, s56, 16 +; SI-NEXT: s_lshl_b32 s8, s28, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x6c, v0 ; SI-NEXT: s_or_b32 s6, s6, s8 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -110416,7 +110776,7 @@ define inreg <64 x i16> @bitcast_v16i64_to_v64i16_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s6, s46, 16 +; SI-NEXT: s_lshl_b32 s6, s26, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x74, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -110430,30 +110790,30 @@ define inreg <64 x i16> @bitcast_v16i64_to_v64i16_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: v_readlane_b32 s69, v20, 21 -; SI-NEXT: v_readlane_b32 s68, v20, 20 -; SI-NEXT: v_readlane_b32 s67, v20, 19 -; SI-NEXT: v_readlane_b32 s66, v20, 18 -; SI-NEXT: v_readlane_b32 s65, v20, 17 -; SI-NEXT: v_readlane_b32 s64, v20, 16 -; SI-NEXT: v_readlane_b32 s55, v20, 15 -; SI-NEXT: v_readlane_b32 s54, v20, 14 -; SI-NEXT: v_readlane_b32 s53, v20, 13 -; SI-NEXT: v_readlane_b32 s52, v20, 12 -; SI-NEXT: v_readlane_b32 s51, v20, 11 -; SI-NEXT: v_readlane_b32 s50, v20, 10 -; SI-NEXT: v_readlane_b32 s49, v20, 9 -; SI-NEXT: v_readlane_b32 s48, v20, 8 -; SI-NEXT: v_readlane_b32 s39, v20, 7 -; SI-NEXT: v_readlane_b32 s38, v20, 6 -; SI-NEXT: v_readlane_b32 s37, v20, 5 -; SI-NEXT: v_readlane_b32 s36, v20, 4 -; SI-NEXT: v_readlane_b32 s35, v20, 3 -; SI-NEXT: v_readlane_b32 s34, v20, 2 -; SI-NEXT: v_readlane_b32 s31, v20, 1 -; SI-NEXT: v_readlane_b32 s30, v20, 0 +; SI-NEXT: v_readlane_b32 s69, v21, 21 +; SI-NEXT: v_readlane_b32 s68, v21, 20 +; SI-NEXT: v_readlane_b32 s67, v21, 19 +; SI-NEXT: v_readlane_b32 s66, v21, 18 +; SI-NEXT: v_readlane_b32 s65, v21, 17 +; SI-NEXT: v_readlane_b32 s64, v21, 16 +; SI-NEXT: v_readlane_b32 s55, v21, 15 +; SI-NEXT: v_readlane_b32 s54, v21, 14 +; SI-NEXT: v_readlane_b32 s53, v21, 13 +; SI-NEXT: v_readlane_b32 s52, v21, 12 +; SI-NEXT: v_readlane_b32 s51, v21, 11 +; SI-NEXT: v_readlane_b32 s50, v21, 10 +; SI-NEXT: v_readlane_b32 s49, v21, 9 +; SI-NEXT: v_readlane_b32 s48, v21, 8 +; SI-NEXT: v_readlane_b32 s39, v21, 7 +; SI-NEXT: v_readlane_b32 s38, v21, 6 +; SI-NEXT: v_readlane_b32 s37, v21, 5 +; SI-NEXT: v_readlane_b32 s36, v21, 4 +; SI-NEXT: v_readlane_b32 s35, v21, 3 +; SI-NEXT: v_readlane_b32 s34, v21, 2 +; SI-NEXT: v_readlane_b32 s31, v21, 1 +; SI-NEXT: v_readlane_b32 s30, v21, 0 ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -110488,8 +110848,8 @@ define inreg <64 x i16> @bitcast_v16i64_to_v64i16_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr39 ; SI-NEXT: ; implicit-def: $sgpr38 ; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: s_branch .LBB69_2 ; ; VI-LABEL: bitcast_v16i64_to_v64i16_scalar: @@ -111989,23 +112349,51 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; VI-LABEL: bitcast_v64i16_to_v16i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v19, s16 ; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: v_mov_b32_e32 v2, s17 ; VI-NEXT: v_readfirstlane_b32 s7, v3 +; VI-NEXT: v_mov_b32_e32 v3, s18 ; VI-NEXT: v_readfirstlane_b32 s8, v4 +; VI-NEXT: v_mov_b32_e32 v4, s19 ; VI-NEXT: v_readfirstlane_b32 s9, v5 +; VI-NEXT: v_mov_b32_e32 v5, s20 ; VI-NEXT: v_readfirstlane_b32 s10, v6 +; VI-NEXT: v_mov_b32_e32 v6, s21 ; VI-NEXT: v_readfirstlane_b32 s11, v7 +; VI-NEXT: v_mov_b32_e32 v7, s22 ; VI-NEXT: v_readfirstlane_b32 s12, v8 +; VI-NEXT: v_mov_b32_e32 v8, s23 ; VI-NEXT: v_readfirstlane_b32 s13, v9 +; VI-NEXT: v_mov_b32_e32 v9, s24 ; VI-NEXT: v_readfirstlane_b32 s14, v10 +; VI-NEXT: v_mov_b32_e32 v10, s25 ; VI-NEXT: v_readfirstlane_b32 s15, v11 -; VI-NEXT: v_readfirstlane_b32 s40, v12 -; VI-NEXT: v_readfirstlane_b32 s41, v13 -; VI-NEXT: v_readfirstlane_b32 s42, v14 -; VI-NEXT: v_readfirstlane_b32 s43, v15 -; VI-NEXT: v_readfirstlane_b32 s44, v16 -; VI-NEXT: v_readfirstlane_b32 s45, v17 +; VI-NEXT: v_mov_b32_e32 v11, s26 +; VI-NEXT: v_readfirstlane_b32 s16, v12 +; VI-NEXT: v_mov_b32_e32 v12, s27 +; VI-NEXT: v_readfirstlane_b32 s17, v13 +; VI-NEXT: v_mov_b32_e32 v13, s28 +; VI-NEXT: v_readfirstlane_b32 s18, v14 +; VI-NEXT: v_mov_b32_e32 v14, s29 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_readfirstlane_b32 s19, v15 +; VI-NEXT: v_readfirstlane_b32 s20, v16 +; VI-NEXT: v_readfirstlane_b32 s21, v17 +; VI-NEXT: v_readfirstlane_b32 s22, v19 +; VI-NEXT: v_readfirstlane_b32 s23, v2 +; VI-NEXT: v_readfirstlane_b32 s24, v3 +; VI-NEXT: v_readfirstlane_b32 s25, v4 +; VI-NEXT: v_readfirstlane_b32 s26, v5 +; VI-NEXT: v_readfirstlane_b32 s27, v6 +; VI-NEXT: v_readfirstlane_b32 s28, v7 +; VI-NEXT: v_readfirstlane_b32 s29, v8 +; VI-NEXT: v_readfirstlane_b32 s40, v9 +; VI-NEXT: v_readfirstlane_b32 s41, v10 +; VI-NEXT: v_readfirstlane_b32 s42, v11 +; VI-NEXT: v_readfirstlane_b32 s43, v12 +; VI-NEXT: v_readfirstlane_b32 s44, v13 +; VI-NEXT: v_readfirstlane_b32 s45, v14 ; VI-NEXT: v_readfirstlane_b32 s46, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s47, v1 @@ -112022,8 +112410,38 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; VI-NEXT: s_and_b32 s4, s46, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s29, 3 +; VI-NEXT: s_add_i32 s5, s45, 3 ; VI-NEXT: s_add_i32 s46, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s45, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s44, 3 +; VI-NEXT: s_add_i32 s45, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s44, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s43, 3 +; VI-NEXT: s_add_i32 s44, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s43, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s42, 3 +; VI-NEXT: s_add_i32 s43, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s42, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s41, 3 +; VI-NEXT: s_add_i32 s42, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s41, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s40, 3 +; VI-NEXT: s_add_i32 s41, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s40, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s29, 3 +; VI-NEXT: s_add_i32 s40, s4, 0x30000 ; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -112092,38 +112510,8 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s45, 3 -; VI-NEXT: s_add_i32 s16, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s45, 0xffff0000 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s44, 3 -; VI-NEXT: s_add_i32 s45, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s44, 0xffff0000 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s43, 3 -; VI-NEXT: s_add_i32 s44, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s43, 0xffff0000 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s42, 3 -; VI-NEXT: s_add_i32 s43, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s42, 0xffff0000 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s41, 3 -; VI-NEXT: s_add_i32 s42, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s41, 0xffff0000 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s40, 3 -; VI-NEXT: s_add_i32 s41, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s40, 0xffff0000 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s15, 3 -; VI-NEXT: s_add_i32 s40, s4, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: s_and_b32 s4, s15, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -112174,20 +112562,20 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s6, s4, 0x30000 ; VI-NEXT: .LBB71_3: ; %end -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 -; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 -; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v0, s22 +; VI-NEXT: v_mov_b32_e32 v1, s23 +; VI-NEXT: v_mov_b32_e32 v2, s24 +; VI-NEXT: v_mov_b32_e32 v3, s25 +; VI-NEXT: v_mov_b32_e32 v4, s26 +; VI-NEXT: v_mov_b32_e32 v5, s27 +; VI-NEXT: v_mov_b32_e32 v6, s28 +; VI-NEXT: v_mov_b32_e32 v7, s29 +; VI-NEXT: v_mov_b32_e32 v8, s40 +; VI-NEXT: v_mov_b32_e32 v9, s41 +; VI-NEXT: v_mov_b32_e32 v10, s42 +; VI-NEXT: v_mov_b32_e32 v11, s43 +; VI-NEXT: v_mov_b32_e32 v12, s44 +; VI-NEXT: v_mov_b32_e32 v13, s45 ; VI-NEXT: v_mov_b32_e32 v14, s46 ; VI-NEXT: v_mov_b32_e32 v15, s47 ; VI-NEXT: v_mov_b32_e32 v16, s6 @@ -112200,12 +112588,12 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v23, s13 ; VI-NEXT: v_mov_b32_e32 v24, s14 ; VI-NEXT: v_mov_b32_e32 v25, s15 -; VI-NEXT: v_mov_b32_e32 v26, s40 -; VI-NEXT: v_mov_b32_e32 v27, s41 -; VI-NEXT: v_mov_b32_e32 v28, s42 -; VI-NEXT: v_mov_b32_e32 v29, s43 -; VI-NEXT: v_mov_b32_e32 v30, s44 -; VI-NEXT: v_mov_b32_e32 v31, s45 +; VI-NEXT: v_mov_b32_e32 v26, s16 +; VI-NEXT: v_mov_b32_e32 v27, s17 +; VI-NEXT: v_mov_b32_e32 v28, s18 +; VI-NEXT: v_mov_b32_e32 v29, s19 +; VI-NEXT: v_mov_b32_e32 v30, s20 +; VI-NEXT: v_mov_b32_e32 v31, s21 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB71_4: ; VI-NEXT: s_branch .LBB71_2 @@ -116844,9 +117232,9 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_writelane_b32 v63, s30, 0 @@ -116871,40 +117259,68 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: v_writelane_b32 v63, s67, 19 ; SI-NEXT: v_writelane_b32 v63, s68, 20 ; SI-NEXT: v_writelane_b32 v63, s69, 21 +; SI-NEXT: v_mov_b32_e32 v20, s16 ; SI-NEXT: v_writelane_b32 v63, s70, 22 +; SI-NEXT: v_readfirstlane_b32 s56, v20 +; SI-NEXT: v_mov_b32_e32 v20, s17 ; SI-NEXT: v_writelane_b32 v63, s71, 23 +; SI-NEXT: v_readfirstlane_b32 s57, v20 +; SI-NEXT: v_mov_b32_e32 v20, s18 ; SI-NEXT: v_writelane_b32 v63, s80, 24 +; SI-NEXT: v_readfirstlane_b32 s46, v20 +; SI-NEXT: v_mov_b32_e32 v20, s19 ; SI-NEXT: v_writelane_b32 v63, s81, 25 +; SI-NEXT: v_readfirstlane_b32 s47, v20 +; SI-NEXT: v_mov_b32_e32 v20, s20 ; SI-NEXT: v_writelane_b32 v63, s82, 26 +; SI-NEXT: v_readfirstlane_b32 s44, v20 +; SI-NEXT: v_mov_b32_e32 v20, s21 ; SI-NEXT: v_writelane_b32 v63, s83, 27 +; SI-NEXT: v_readfirstlane_b32 s45, v20 +; SI-NEXT: v_mov_b32_e32 v20, s22 ; SI-NEXT: v_writelane_b32 v63, s84, 28 +; SI-NEXT: v_readfirstlane_b32 s42, v20 +; SI-NEXT: v_mov_b32_e32 v20, s23 ; SI-NEXT: v_writelane_b32 v63, s85, 29 +; SI-NEXT: v_readfirstlane_b32 s43, v20 +; SI-NEXT: v_mov_b32_e32 v20, s24 ; SI-NEXT: v_writelane_b32 v63, s86, 30 +; SI-NEXT: v_readfirstlane_b32 s40, v20 +; SI-NEXT: v_mov_b32_e32 v20, s25 ; SI-NEXT: v_writelane_b32 v63, s87, 31 +; SI-NEXT: v_readfirstlane_b32 s41, v20 +; SI-NEXT: v_mov_b32_e32 v20, s26 ; SI-NEXT: v_writelane_b32 v63, s96, 32 +; SI-NEXT: v_readfirstlane_b32 s24, v20 +; SI-NEXT: v_mov_b32_e32 v20, s27 ; SI-NEXT: v_writelane_b32 v63, s97, 33 +; SI-NEXT: v_readfirstlane_b32 s25, v20 +; SI-NEXT: v_mov_b32_e32 v20, s28 ; SI-NEXT: v_writelane_b32 v63, s98, 34 +; SI-NEXT: v_readfirstlane_b32 s22, v20 +; SI-NEXT: v_mov_b32_e32 v20, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 ; SI-NEXT: v_writelane_b32 v63, s99, 35 -; SI-NEXT: v_readfirstlane_b32 s4, v1 -; SI-NEXT: v_readfirstlane_b32 s5, v2 -; SI-NEXT: v_readfirstlane_b32 s6, v3 -; SI-NEXT: v_readfirstlane_b32 s7, v4 -; SI-NEXT: v_readfirstlane_b32 s8, v5 -; SI-NEXT: v_readfirstlane_b32 s9, v6 -; SI-NEXT: v_readfirstlane_b32 s10, v7 -; SI-NEXT: v_readfirstlane_b32 s11, v8 +; SI-NEXT: v_readfirstlane_b32 s23, v20 +; SI-NEXT: v_readfirstlane_b32 s20, v1 +; SI-NEXT: v_readfirstlane_b32 s21, v2 +; SI-NEXT: v_readfirstlane_b32 s18, v3 +; SI-NEXT: v_readfirstlane_b32 s19, v4 +; SI-NEXT: v_readfirstlane_b32 s16, v5 +; SI-NEXT: v_readfirstlane_b32 s17, v6 +; SI-NEXT: v_readfirstlane_b32 s14, v7 +; SI-NEXT: v_readfirstlane_b32 s15, v8 ; SI-NEXT: v_readfirstlane_b32 s12, v9 ; SI-NEXT: v_readfirstlane_b32 s13, v10 -; SI-NEXT: v_readfirstlane_b32 s14, v11 -; SI-NEXT: v_readfirstlane_b32 s15, v12 -; SI-NEXT: v_readfirstlane_b32 s40, v13 -; SI-NEXT: v_readfirstlane_b32 s41, v14 -; SI-NEXT: v_readfirstlane_b32 s42, v15 -; SI-NEXT: v_readfirstlane_b32 s43, v16 -; SI-NEXT: v_readfirstlane_b32 s44, v17 -; SI-NEXT: s_and_b64 s[46:47], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s45, v18 +; SI-NEXT: v_readfirstlane_b32 s10, v11 +; SI-NEXT: v_readfirstlane_b32 s11, v12 +; SI-NEXT: v_readfirstlane_b32 s8, v13 +; SI-NEXT: v_readfirstlane_b32 s9, v14 +; SI-NEXT: v_readfirstlane_b32 s6, v15 +; SI-NEXT: v_readfirstlane_b32 s7, v16 +; SI-NEXT: v_readfirstlane_b32 s4, v17 +; SI-NEXT: s_and_b64 s[26:27], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v18 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill @@ -116918,488 +117334,463 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr61 : SGPR spill to VGPR lane ; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane +; SI-NEXT: ; implicit-def: $vgpr61 : SGPR spill to VGPR lane ; SI-NEXT: s_cbranch_scc0 .LBB73_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s46, s45, 24 -; SI-NEXT: v_writelane_b32 v61, s46, 34 -; SI-NEXT: s_lshr_b32 s46, s45, 16 -; SI-NEXT: v_writelane_b32 v61, s46, 35 -; SI-NEXT: s_lshr_b32 s46, s45, 8 -; SI-NEXT: v_writelane_b32 v61, s46, 36 -; SI-NEXT: s_lshr_b32 s46, s43, 24 -; SI-NEXT: v_writelane_b32 v61, s46, 37 -; SI-NEXT: s_lshr_b32 s46, s43, 16 -; SI-NEXT: v_writelane_b32 v61, s46, 38 -; SI-NEXT: s_lshr_b32 s46, s43, 8 -; SI-NEXT: v_writelane_b32 v61, s46, 39 -; SI-NEXT: s_lshr_b32 s46, s41, 24 -; SI-NEXT: v_writelane_b32 v61, s46, 40 -; SI-NEXT: s_lshr_b32 s46, s41, 16 -; SI-NEXT: v_writelane_b32 v61, s46, 41 -; SI-NEXT: s_lshr_b32 s46, s41, 8 -; SI-NEXT: v_writelane_b32 v61, s46, 42 -; SI-NEXT: s_lshr_b32 s46, s15, 24 -; SI-NEXT: v_writelane_b32 v61, s46, 43 -; SI-NEXT: s_lshr_b32 s46, s15, 16 -; SI-NEXT: v_writelane_b32 v61, s46, 44 -; SI-NEXT: s_lshr_b32 s46, s15, 8 -; SI-NEXT: v_writelane_b32 v61, s46, 45 -; SI-NEXT: s_lshr_b32 s46, s13, 24 -; SI-NEXT: v_writelane_b32 v61, s46, 46 -; SI-NEXT: s_lshr_b32 s46, s13, 16 -; SI-NEXT: v_writelane_b32 v61, s46, 47 -; SI-NEXT: s_lshr_b32 s46, s13, 8 -; SI-NEXT: v_writelane_b32 v61, s46, 48 -; SI-NEXT: s_lshr_b32 s46, s11, 24 -; SI-NEXT: v_writelane_b32 v61, s46, 49 -; SI-NEXT: s_lshr_b32 s46, s11, 16 -; SI-NEXT: v_writelane_b32 v61, s46, 50 -; SI-NEXT: s_lshr_b32 s46, s11, 8 -; SI-NEXT: v_writelane_b32 v61, s46, 51 -; SI-NEXT: s_lshr_b32 s46, s9, 24 -; SI-NEXT: v_writelane_b32 v61, s46, 52 -; SI-NEXT: s_lshr_b32 s46, s9, 16 -; SI-NEXT: v_writelane_b32 v61, s46, 53 -; SI-NEXT: s_lshr_b32 s46, s9, 8 -; SI-NEXT: v_writelane_b32 v61, s46, 54 -; SI-NEXT: s_lshr_b32 s46, s7, 24 -; SI-NEXT: v_writelane_b32 v61, s46, 55 -; SI-NEXT: s_lshr_b32 s46, s7, 16 -; SI-NEXT: v_writelane_b32 v61, s46, 56 -; SI-NEXT: s_lshr_b32 s46, s7, 8 -; SI-NEXT: v_writelane_b32 v61, s46, 57 -; SI-NEXT: s_lshr_b32 s46, s5, 24 -; SI-NEXT: v_writelane_b32 v61, s46, 58 -; SI-NEXT: s_lshr_b32 s46, s5, 16 -; SI-NEXT: v_writelane_b32 v61, s46, 59 -; SI-NEXT: s_lshr_b32 s46, s5, 8 -; SI-NEXT: v_writelane_b32 v61, s46, 60 -; SI-NEXT: s_lshr_b32 s46, s29, 24 -; SI-NEXT: v_writelane_b32 v61, s46, 61 -; SI-NEXT: s_lshr_b32 s46, s29, 16 -; SI-NEXT: v_writelane_b32 v61, s46, 62 -; SI-NEXT: s_lshr_b32 s46, s29, 8 -; SI-NEXT: v_writelane_b32 v61, s46, 63 -; SI-NEXT: s_lshr_b32 s46, s27, 24 -; SI-NEXT: v_writelane_b32 v62, s46, 0 -; SI-NEXT: s_lshr_b32 s46, s27, 16 -; SI-NEXT: v_writelane_b32 v62, s46, 1 -; SI-NEXT: s_lshr_b32 s46, s27, 8 -; SI-NEXT: v_writelane_b32 v62, s46, 2 -; SI-NEXT: s_lshr_b32 s46, s25, 24 -; SI-NEXT: v_writelane_b32 v62, s46, 3 -; SI-NEXT: s_lshr_b32 s46, s25, 16 -; SI-NEXT: v_writelane_b32 v62, s46, 4 -; SI-NEXT: s_lshr_b32 s46, s25, 8 -; SI-NEXT: v_writelane_b32 v62, s46, 5 -; SI-NEXT: s_lshr_b32 s46, s23, 24 -; SI-NEXT: v_writelane_b32 v62, s46, 6 -; SI-NEXT: s_lshr_b32 s46, s23, 16 -; SI-NEXT: v_writelane_b32 v62, s46, 7 -; SI-NEXT: s_lshr_b32 s46, s23, 8 -; SI-NEXT: v_writelane_b32 v62, s46, 8 -; SI-NEXT: s_lshr_b32 s46, s21, 24 -; SI-NEXT: v_writelane_b32 v62, s46, 9 -; SI-NEXT: s_lshr_b32 s46, s21, 16 -; SI-NEXT: v_writelane_b32 v62, s46, 10 -; SI-NEXT: s_lshr_b32 s46, s21, 8 -; SI-NEXT: v_writelane_b32 v62, s46, 11 -; SI-NEXT: s_lshr_b32 s46, s19, 24 -; SI-NEXT: v_writelane_b32 v62, s46, 12 -; SI-NEXT: s_lshr_b32 s46, s19, 16 -; SI-NEXT: v_writelane_b32 v62, s46, 13 -; SI-NEXT: s_lshr_b32 s46, s19, 8 -; SI-NEXT: v_writelane_b32 v62, s46, 14 -; SI-NEXT: s_lshr_b32 s46, s17, 24 -; SI-NEXT: v_writelane_b32 v62, s46, 15 -; SI-NEXT: s_lshr_b32 s46, s17, 16 -; SI-NEXT: v_writelane_b32 v62, s46, 16 -; SI-NEXT: s_lshr_b32 s46, s17, 8 -; SI-NEXT: v_writelane_b32 v62, s46, 17 -; SI-NEXT: s_lshr_b64 s[46:47], s[44:45], 24 -; SI-NEXT: v_writelane_b32 v61, s46, 32 -; SI-NEXT: v_writelane_b32 v61, s47, 33 -; SI-NEXT: s_lshr_b64 s[46:47], s[44:45], 8 -; SI-NEXT: v_writelane_b32 v61, s46, 30 -; SI-NEXT: v_writelane_b32 v61, s47, 31 -; SI-NEXT: s_lshr_b64 s[46:47], s[42:43], 24 -; SI-NEXT: v_writelane_b32 v61, s46, 28 -; SI-NEXT: v_writelane_b32 v61, s47, 29 -; SI-NEXT: s_lshr_b64 s[46:47], s[42:43], 16 -; SI-NEXT: v_writelane_b32 v61, s46, 26 -; SI-NEXT: v_writelane_b32 v61, s47, 27 -; SI-NEXT: s_lshr_b64 s[46:47], s[42:43], 8 -; SI-NEXT: v_writelane_b32 v61, s46, 24 -; SI-NEXT: v_writelane_b32 v61, s47, 25 -; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 24 -; SI-NEXT: v_writelane_b32 v61, s46, 22 -; SI-NEXT: v_writelane_b32 v61, s47, 23 -; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 16 -; SI-NEXT: v_writelane_b32 v61, s46, 20 -; SI-NEXT: v_writelane_b32 v61, s47, 21 -; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 8 -; SI-NEXT: v_writelane_b32 v61, s46, 18 -; SI-NEXT: v_writelane_b32 v61, s47, 19 -; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 24 -; SI-NEXT: v_writelane_b32 v61, s46, 16 -; SI-NEXT: v_writelane_b32 v61, s47, 17 -; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 16 -; SI-NEXT: v_writelane_b32 v61, s46, 14 -; SI-NEXT: v_writelane_b32 v61, s47, 15 -; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 8 -; SI-NEXT: v_writelane_b32 v61, s46, 12 -; SI-NEXT: v_writelane_b32 v61, s47, 13 -; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 24 -; SI-NEXT: v_writelane_b32 v61, s46, 10 -; SI-NEXT: v_writelane_b32 v61, s47, 11 -; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 16 -; SI-NEXT: v_writelane_b32 v61, s46, 8 -; SI-NEXT: v_writelane_b32 v61, s47, 9 -; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 8 -; SI-NEXT: v_writelane_b32 v61, s46, 6 -; SI-NEXT: v_writelane_b32 v61, s47, 7 -; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 24 -; SI-NEXT: v_writelane_b32 v61, s46, 4 -; SI-NEXT: v_writelane_b32 v61, s47, 5 -; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 16 -; SI-NEXT: v_writelane_b32 v61, s46, 2 -; SI-NEXT: v_writelane_b32 v61, s47, 3 -; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 8 -; SI-NEXT: v_writelane_b32 v61, s46, 0 -; SI-NEXT: s_lshr_b64 s[48:49], s[44:45], 16 -; SI-NEXT: v_writelane_b32 v61, s47, 1 -; SI-NEXT: s_lshr_b64 s[50:51], s[8:9], 24 -; SI-NEXT: s_lshr_b64 s[52:53], s[8:9], 16 -; SI-NEXT: s_lshr_b64 s[54:55], s[8:9], 8 -; SI-NEXT: s_lshr_b64 s[64:65], s[6:7], 24 -; SI-NEXT: s_lshr_b64 s[66:67], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[68:69], s[6:7], 8 -; SI-NEXT: s_lshr_b64 s[70:71], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[80:81], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[82:83], s[4:5], 8 -; SI-NEXT: s_lshr_b64 s[84:85], s[28:29], 24 -; SI-NEXT: s_lshr_b64 s[86:87], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[96:97], s[28:29], 8 -; SI-NEXT: s_lshr_b64 s[98:99], s[26:27], 24 -; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[56:57], s[26:27], 8 -; SI-NEXT: s_lshr_b64 s[58:59], s[24:25], 24 -; SI-NEXT: s_lshr_b64 s[60:61], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[62:63], s[24:25], 8 -; SI-NEXT: s_lshr_b64 s[72:73], s[22:23], 24 -; SI-NEXT: s_lshr_b64 s[74:75], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[22:23], 8 -; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 -; SI-NEXT: s_lshr_b64 s[88:89], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[90:91], s[20:21], 8 -; SI-NEXT: s_lshr_b64 s[92:93], s[18:19], 24 -; SI-NEXT: s_lshr_b64 s[94:95], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[30:31], s[18:19], 8 -; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 24 -; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 16 -; SI-NEXT: s_lshr_b64 s[38:39], s[16:17], 8 +; SI-NEXT: s_lshr_b32 s26, s5, 24 +; SI-NEXT: v_writelane_b32 v62, s26, 34 +; SI-NEXT: s_lshr_b32 s26, s5, 16 +; SI-NEXT: v_writelane_b32 v62, s26, 35 +; SI-NEXT: s_lshr_b32 s26, s5, 8 +; SI-NEXT: v_writelane_b32 v62, s26, 36 +; SI-NEXT: s_lshr_b32 s26, s7, 24 +; SI-NEXT: v_writelane_b32 v62, s26, 37 +; SI-NEXT: s_lshr_b32 s26, s7, 16 +; SI-NEXT: v_writelane_b32 v62, s26, 38 +; SI-NEXT: s_lshr_b32 s26, s7, 8 +; SI-NEXT: v_writelane_b32 v62, s26, 39 +; SI-NEXT: s_lshr_b32 s26, s9, 24 +; SI-NEXT: v_writelane_b32 v62, s26, 40 +; SI-NEXT: s_lshr_b32 s26, s9, 16 +; SI-NEXT: v_writelane_b32 v62, s26, 41 +; SI-NEXT: s_lshr_b32 s26, s9, 8 +; SI-NEXT: v_writelane_b32 v62, s26, 42 +; SI-NEXT: s_lshr_b32 s26, s11, 24 +; SI-NEXT: v_writelane_b32 v62, s26, 43 +; SI-NEXT: s_lshr_b32 s26, s11, 16 +; SI-NEXT: v_writelane_b32 v62, s26, 44 +; SI-NEXT: s_lshr_b32 s26, s11, 8 +; SI-NEXT: v_writelane_b32 v62, s26, 45 +; SI-NEXT: s_lshr_b32 s26, s13, 24 +; SI-NEXT: v_writelane_b32 v62, s26, 46 +; SI-NEXT: s_lshr_b32 s26, s13, 16 +; SI-NEXT: v_writelane_b32 v62, s26, 47 +; SI-NEXT: s_lshr_b32 s26, s13, 8 +; SI-NEXT: v_writelane_b32 v62, s26, 48 +; SI-NEXT: s_lshr_b32 s26, s15, 24 +; SI-NEXT: v_writelane_b32 v62, s26, 49 +; SI-NEXT: s_lshr_b32 s26, s15, 16 +; SI-NEXT: v_writelane_b32 v62, s26, 50 +; SI-NEXT: s_lshr_b32 s26, s15, 8 +; SI-NEXT: v_writelane_b32 v62, s26, 51 +; SI-NEXT: s_lshr_b32 s26, s17, 24 +; SI-NEXT: v_writelane_b32 v62, s26, 52 +; SI-NEXT: s_lshr_b32 s26, s17, 16 +; SI-NEXT: v_writelane_b32 v62, s26, 53 +; SI-NEXT: s_lshr_b32 s26, s17, 8 +; SI-NEXT: v_writelane_b32 v62, s26, 54 +; SI-NEXT: s_lshr_b32 s26, s19, 24 +; SI-NEXT: v_writelane_b32 v62, s26, 55 +; SI-NEXT: s_lshr_b32 s26, s19, 16 +; SI-NEXT: v_writelane_b32 v62, s26, 56 +; SI-NEXT: s_lshr_b32 s26, s19, 8 +; SI-NEXT: v_writelane_b32 v62, s26, 57 +; SI-NEXT: s_lshr_b32 s26, s21, 24 +; SI-NEXT: v_writelane_b32 v62, s26, 58 +; SI-NEXT: s_lshr_b32 s26, s21, 16 +; SI-NEXT: v_writelane_b32 v62, s26, 59 +; SI-NEXT: s_lshr_b32 s26, s21, 8 +; SI-NEXT: v_writelane_b32 v62, s26, 60 +; SI-NEXT: s_lshr_b32 s26, s23, 24 +; SI-NEXT: v_writelane_b32 v62, s26, 61 +; SI-NEXT: s_lshr_b32 s26, s23, 16 +; SI-NEXT: v_writelane_b32 v62, s26, 62 +; SI-NEXT: s_lshr_b32 s26, s23, 8 +; SI-NEXT: v_writelane_b32 v62, s26, 63 +; SI-NEXT: s_lshr_b32 s26, s25, 24 +; SI-NEXT: v_writelane_b32 v61, s26, 0 +; SI-NEXT: s_lshr_b32 s26, s25, 16 +; SI-NEXT: v_writelane_b32 v61, s26, 1 +; SI-NEXT: s_lshr_b32 s26, s25, 8 +; SI-NEXT: v_writelane_b32 v61, s26, 2 +; SI-NEXT: s_lshr_b32 s26, s41, 24 +; SI-NEXT: v_writelane_b32 v61, s26, 3 +; SI-NEXT: s_lshr_b32 s26, s41, 16 +; SI-NEXT: v_writelane_b32 v61, s26, 4 +; SI-NEXT: s_lshr_b32 s26, s41, 8 +; SI-NEXT: v_writelane_b32 v61, s26, 5 +; SI-NEXT: s_lshr_b32 s26, s43, 24 +; SI-NEXT: v_writelane_b32 v61, s26, 6 +; SI-NEXT: s_lshr_b32 s26, s43, 16 +; SI-NEXT: v_writelane_b32 v61, s26, 7 +; SI-NEXT: s_lshr_b32 s26, s43, 8 +; SI-NEXT: v_writelane_b32 v61, s26, 8 +; SI-NEXT: s_lshr_b32 s26, s45, 24 +; SI-NEXT: v_writelane_b32 v61, s26, 9 +; SI-NEXT: s_lshr_b32 s26, s45, 16 +; SI-NEXT: v_writelane_b32 v61, s26, 10 +; SI-NEXT: s_lshr_b32 s26, s45, 8 +; SI-NEXT: v_writelane_b32 v61, s26, 11 +; SI-NEXT: s_lshr_b32 s26, s47, 24 +; SI-NEXT: v_writelane_b32 v61, s26, 12 +; SI-NEXT: s_lshr_b32 s26, s47, 16 +; SI-NEXT: v_writelane_b32 v61, s26, 13 +; SI-NEXT: s_lshr_b32 s26, s47, 8 +; SI-NEXT: v_writelane_b32 v61, s26, 14 +; SI-NEXT: s_lshr_b32 s26, s57, 24 +; SI-NEXT: v_writelane_b32 v61, s26, 15 +; SI-NEXT: s_lshr_b32 s26, s57, 16 +; SI-NEXT: v_writelane_b32 v61, s26, 16 +; SI-NEXT: s_lshr_b32 s26, s57, 8 +; SI-NEXT: v_writelane_b32 v61, s26, 17 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 24 +; SI-NEXT: v_writelane_b32 v62, s26, 32 +; SI-NEXT: v_writelane_b32 v62, s27, 33 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 8 +; SI-NEXT: v_writelane_b32 v62, s26, 30 +; SI-NEXT: v_writelane_b32 v62, s27, 31 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 24 +; SI-NEXT: v_writelane_b32 v62, s26, 28 +; SI-NEXT: v_writelane_b32 v62, s27, 29 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 16 +; SI-NEXT: v_writelane_b32 v62, s26, 26 +; SI-NEXT: v_writelane_b32 v62, s27, 27 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 8 +; SI-NEXT: v_writelane_b32 v62, s26, 24 +; SI-NEXT: v_writelane_b32 v62, s27, 25 +; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 24 +; SI-NEXT: v_writelane_b32 v62, s26, 22 +; SI-NEXT: v_writelane_b32 v62, s27, 23 +; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 16 +; SI-NEXT: v_writelane_b32 v62, s26, 20 +; SI-NEXT: v_writelane_b32 v62, s27, 21 +; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 8 +; SI-NEXT: v_writelane_b32 v62, s26, 18 +; SI-NEXT: v_writelane_b32 v62, s27, 19 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 24 +; SI-NEXT: v_writelane_b32 v62, s26, 16 +; SI-NEXT: v_writelane_b32 v62, s27, 17 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 16 +; SI-NEXT: v_writelane_b32 v62, s26, 14 +; SI-NEXT: v_writelane_b32 v62, s27, 15 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 8 +; SI-NEXT: v_writelane_b32 v62, s26, 12 +; SI-NEXT: v_writelane_b32 v62, s27, 13 +; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 24 +; SI-NEXT: v_writelane_b32 v62, s26, 10 +; SI-NEXT: v_writelane_b32 v62, s27, 11 +; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 16 +; SI-NEXT: v_writelane_b32 v62, s26, 8 +; SI-NEXT: v_writelane_b32 v62, s27, 9 +; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 8 +; SI-NEXT: v_writelane_b32 v62, s26, 6 +; SI-NEXT: v_writelane_b32 v62, s27, 7 +; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 24 +; SI-NEXT: v_writelane_b32 v62, s26, 4 +; SI-NEXT: v_writelane_b32 v62, s27, 5 +; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 16 +; SI-NEXT: v_writelane_b32 v62, s26, 2 +; SI-NEXT: v_writelane_b32 v62, s27, 3 +; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 8 +; SI-NEXT: v_writelane_b32 v62, s26, 0 +; SI-NEXT: s_lshr_b64 s[48:49], s[4:5], 16 +; SI-NEXT: v_writelane_b32 v62, s27, 1 +; SI-NEXT: s_lshr_b64 s[50:51], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[52:53], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[54:55], s[16:17], 8 +; SI-NEXT: s_lshr_b64 s[64:65], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[66:67], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[68:69], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[70:71], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[80:81], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[82:83], s[20:21], 8 +; SI-NEXT: s_lshr_b64 s[84:85], s[22:23], 24 +; SI-NEXT: s_lshr_b64 s[86:87], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[96:97], s[22:23], 8 +; SI-NEXT: s_lshr_b64 s[98:99], s[24:25], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[24:25], 8 +; SI-NEXT: s_lshr_b64 s[58:59], s[40:41], 24 +; SI-NEXT: s_lshr_b64 s[60:61], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[40:41], 8 +; SI-NEXT: s_lshr_b64 s[72:73], s[42:43], 24 +; SI-NEXT: s_lshr_b64 s[74:75], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[42:43], 8 +; SI-NEXT: s_lshr_b64 s[78:79], s[44:45], 24 +; SI-NEXT: s_lshr_b64 s[88:89], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[44:45], 8 +; SI-NEXT: s_lshr_b64 s[92:93], s[46:47], 24 +; SI-NEXT: s_lshr_b64 s[94:95], s[46:47], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[46:47], 8 +; SI-NEXT: s_lshr_b64 s[34:35], s[56:57], 24 +; SI-NEXT: s_lshr_b64 s[36:37], s[56:57], 16 +; SI-NEXT: s_lshr_b64 s[38:39], s[56:57], 8 ; SI-NEXT: s_cbranch_execnz .LBB73_4 ; SI-NEXT: .LBB73_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[5:6], s[40:41], 1.0 -; SI-NEXT: v_add_f64 v[7:8], s[14:15], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v8 -; SI-NEXT: v_add_f64 v[9:10], s[12:13], 1.0 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v8 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v8 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v10 -; SI-NEXT: v_add_f64 v[11:12], s[10:11], 1.0 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v10 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v12 -; SI-NEXT: v_add_f64 v[13:14], s[8:9], 1.0 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v12 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v12 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v14 -; SI-NEXT: v_add_f64 v[15:16], s[6:7], 1.0 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v14 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v14 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v16 -; SI-NEXT: v_add_f64 v[17:18], s[4:5], 1.0 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v16 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v18 -; SI-NEXT: v_add_f64 v[19:20], s[28:29], 1.0 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v18 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v20 -; SI-NEXT: v_add_f64 v[21:22], s[26:27], 1.0 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v20 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v20 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v22 -; SI-NEXT: v_add_f64 v[23:24], s[24:25], 1.0 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v22 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v24 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 -; SI-NEXT: v_add_f64 v[38:39], s[22:23], 1.0 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v24 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v39 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v39 -; SI-NEXT: v_add_f64 v[52:53], s[20:21], 1.0 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v39 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v53 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v53 -; SI-NEXT: v_add_f64 v[44:45], s[18:19], 1.0 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v53 -; SI-NEXT: v_add_f64 v[1:2], s[44:45], 1.0 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v45 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v45 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[1:2], 24 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[1:2], 16 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[1:2], 8 -; SI-NEXT: v_add_f64 v[3:4], s[42:43], 1.0 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[3:4], 24 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[3:4], 16 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[3:4], 8 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[5:6], 24 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[5:6], 16 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[5:6], 8 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[7:8], 24 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[7:8], 16 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[7:8], 8 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[9:10], 24 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[9:10], 16 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[9:10], 8 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[11:12], 24 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[11:12], 16 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[11:12], 8 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[13:14], 24 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[13:14], 16 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[13:14], 8 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[15:16], 24 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: v_add_f64 v[30:31], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[7:8], s[24:25], 1.0 +; SI-NEXT: v_lshr_b64 v[9:10], v[30:31], 24 +; SI-NEXT: v_lshr_b64 v[10:11], v[7:8], 24 +; SI-NEXT: v_add_f64 v[54:55], s[40:41], 1.0 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[10:11], v[54:55], 24 +; SI-NEXT: v_add_f64 v[42:43], s[42:43], 1.0 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[10:11], v[54:55], 16 +; SI-NEXT: v_add_f64 v[37:38], s[4:5], 1.0 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[10:11], v[42:43], 16 +; SI-NEXT: v_add_f64 v[56:57], s[44:45], 1.0 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[10:11], v[42:43], 8 +; SI-NEXT: v_lshr_b64 v[1:2], v[37:38], 24 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[10:11], v[56:57], 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshr_b64 v[1:2], v[37:38], 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[15:16], 16 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[10:11], v[56:57], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshr_b64 v[1:2], v[37:38], 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[15:16], 8 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[10:11], v[56:57], 8 +; SI-NEXT: v_add_f64 v[35:36], s[6:7], 1.0 +; SI-NEXT: v_add_f64 v[19:20], s[46:47], 1.0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshr_b64 v[1:2], v[35:36], 24 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[17:18], 24 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[10:11], v[19:20], 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshr_b64 v[1:2], v[35:36], 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[17:18], 16 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[10:11], v[19:20], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshr_b64 v[1:2], v[35:36], 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[17:18], 8 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[10:11], v[19:20], 8 +; SI-NEXT: v_add_f64 v[3:4], s[8:9], 1.0 +; SI-NEXT: v_add_f64 v[24:25], s[56:57], 1.0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshr_b64 v[1:2], v[3:4], 24 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[19:20], 24 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[10:11], v[24:25], 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshr_b64 v[1:2], v[3:4], 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[19:20], 16 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[10:11], v[24:25], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshr_b64 v[1:2], v[3:4], 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[19:20], 8 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[10:11], v[24:25], 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[21:22], 24 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_add_f64 v[47:48], s[10:11], 1.0 +; SI-NEXT: v_add_f64 v[13:14], s[16:17], 1.0 +; SI-NEXT: v_lshr_b64 v[1:2], v[47:48], 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_add_f64 v[32:33], s[14:15], 1.0 +; SI-NEXT: v_add_f64 v[28:29], s[18:19], 1.0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[47:48], v[23:24], 16 -; SI-NEXT: v_add_f64 v[58:59], s[16:17], 1.0 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: v_lshr_b64 v[35:36], v[21:22], 8 -; SI-NEXT: v_lshr_b64 v[48:49], v[23:24], 8 -; SI-NEXT: v_lshrrev_b32_e32 v27, 24, v2 -; SI-NEXT: v_lshr_b64 v[36:37], v[23:24], 24 -; SI-NEXT: v_lshr_b64 v[49:50], v[38:39], 24 -; SI-NEXT: v_lshr_b64 v[40:41], v[38:39], 8 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[44:45], 8 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v2 -; SI-NEXT: v_mov_b32_e32 v37, v27 -; SI-NEXT: v_lshr_b64 v[50:51], v[38:39], 16 -; SI-NEXT: v_lshr_b64 v[41:42], v[52:53], 24 -; SI-NEXT: v_lshr_b64 v[54:55], v[52:53], 8 -; SI-NEXT: v_lshr_b64 v[26:27], v[58:59], 24 -; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v2 -; SI-NEXT: v_mov_b32_e32 v51, v28 -; SI-NEXT: v_lshr_b64 v[42:43], v[52:53], 16 -; SI-NEXT: v_lshr_b64 v[55:56], v[44:45], 24 -; SI-NEXT: v_lshr_b64 v[27:28], v[58:59], 16 -; SI-NEXT: v_mov_b32_e32 v43, v29 -; SI-NEXT: v_lshr_b64 v[56:57], v[44:45], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[58:59], 8 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v30, 8, v4 -; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v6 -; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v45 -; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v59 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v59 -; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v59 -; SI-NEXT: v_lshrrev_b32_e32 v57, 24, v4 +; SI-NEXT: v_lshr_b64 v[1:2], v[47:48], 8 +; SI-NEXT: v_lshr_b64 v[17:18], v[13:14], 16 +; SI-NEXT: v_add_f64 v[21:22], s[20:21], 1.0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[15:16], v[32:33], 8 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[26:27], v[28:29], 8 +; SI-NEXT: v_readfirstlane_b32 s11, v48 +; SI-NEXT: v_lshr_b64 v[51:52], v[47:48], 16 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshr_b64 v[1:2], v[28:29], 24 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshr_b64 v[16:17], v[28:29], 16 +; SI-NEXT: v_mov_b32_e32 v48, v28 +; SI-NEXT: v_lshr_b64 v[27:28], v[21:22], 8 +; SI-NEXT: v_readfirstlane_b32 s19, v29 +; SI-NEXT: v_lshr_b64 v[28:29], v[30:31], 16 +; SI-NEXT: v_readfirstlane_b32 s23, v31 +; SI-NEXT: v_lshr_b64 v[52:53], v[30:31], 8 +; SI-NEXT: v_readfirstlane_b32 s47, v20 +; SI-NEXT: v_readfirstlane_b32 s43, v43 +; SI-NEXT: v_readfirstlane_b32 s5, v38 +; SI-NEXT: v_lshr_b64 v[38:39], v[42:43], 24 +; SI-NEXT: v_add_f64 v[44:45], s[12:13], 1.0 +; SI-NEXT: v_readfirstlane_b32 s9, v4 +; SI-NEXT: v_lshr_b64 v[58:59], v[44:45], 24 +; SI-NEXT: v_lshr_b64 v[4:5], v[32:33], 16 +; SI-NEXT: v_readfirstlane_b32 s15, v33 +; SI-NEXT: v_readfirstlane_b32 s13, v45 +; SI-NEXT: v_readfirstlane_b32 s7, v36 +; SI-NEXT: v_lshr_b64 v[40:41], v[44:45], 16 +; SI-NEXT: v_lshr_b64 v[59:60], v[44:45], 8 +; SI-NEXT: v_lshr_b64 v[45:46], v[32:33], 24 +; SI-NEXT: v_lshr_b64 v[33:34], v[13:14], 24 +; SI-NEXT: v_lshr_b64 v[5:6], v[13:14], 8 +; SI-NEXT: v_mov_b32_e32 v36, v13 +; SI-NEXT: v_lshr_b64 v[12:13], v[21:22], 24 +; SI-NEXT: v_readfirstlane_b32 s17, v14 +; SI-NEXT: v_lshr_b64 v[13:14], v[7:8], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[54:55], 8 +; SI-NEXT: v_mov_b32_e32 v14, v9 +; SI-NEXT: v_readfirstlane_b32 s57, v25 +; SI-NEXT: v_mov_b32_e32 v25, v12 +; SI-NEXT: v_mov_b32_e32 v12, v5 +; SI-NEXT: v_mov_b32_e32 v5, v16 +; SI-NEXT: v_readfirstlane_b32 s25, v8 +; SI-NEXT: v_readfirstlane_b32 s21, v22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[17:18], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[7:8], 8 +; SI-NEXT: v_mov_b32_e32 v23, v27 +; SI-NEXT: v_mov_b32_e32 v27, v17 +; SI-NEXT: v_mov_b32_e32 v17, v15 +; SI-NEXT: v_readfirstlane_b32 s45, v57 +; SI-NEXT: v_readfirstlane_b32 s41, v55 +; SI-NEXT: s_lshr_b32 s10, s5, 24 +; SI-NEXT: s_lshr_b32 s12, s5, 16 +; SI-NEXT: s_lshr_b32 s14, s5, 8 +; SI-NEXT: s_lshr_b32 s16, s7, 24 +; SI-NEXT: s_lshr_b32 s18, s7, 16 +; SI-NEXT: s_lshr_b32 s20, s7, 8 +; SI-NEXT: s_lshr_b32 s22, s9, 24 +; SI-NEXT: s_lshr_b32 s24, s9, 16 +; SI-NEXT: s_lshr_b32 s26, s9, 8 +; SI-NEXT: s_lshr_b32 s27, s11, 24 +; SI-NEXT: s_lshr_b32 s28, s11, 16 +; SI-NEXT: s_lshr_b32 s29, s11, 8 +; SI-NEXT: s_lshr_b32 s40, s13, 24 +; SI-NEXT: s_lshr_b32 s42, s13, 16 +; SI-NEXT: s_lshr_b32 s44, s13, 8 +; SI-NEXT: s_lshr_b32 s46, s15, 24 +; SI-NEXT: s_lshr_b32 s56, s15, 16 +; SI-NEXT: s_lshr_b32 s58, s15, 8 +; SI-NEXT: s_lshr_b32 s59, s17, 24 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mov_b32_e32 v29, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_lshr_b32 s60, s17, 16 +; SI-NEXT: s_lshr_b32 s61, s17, 8 +; SI-NEXT: s_lshr_b32 s62, s19, 24 +; SI-NEXT: s_lshr_b32 s63, s19, 16 +; SI-NEXT: s_lshr_b32 s72, s19, 8 +; SI-NEXT: s_lshr_b32 s73, s21, 24 +; SI-NEXT: s_lshr_b32 s74, s21, 16 +; SI-NEXT: s_lshr_b32 s75, s21, 8 +; SI-NEXT: s_lshr_b32 s76, s23, 24 +; SI-NEXT: s_lshr_b32 s77, s23, 16 +; SI-NEXT: s_lshr_b32 s78, s23, 8 +; SI-NEXT: s_lshr_b32 s79, s25, 24 +; SI-NEXT: s_lshr_b32 s88, s25, 16 +; SI-NEXT: s_lshr_b32 s89, s25, 8 +; SI-NEXT: s_lshr_b32 s90, s41, 24 +; SI-NEXT: s_lshr_b32 s91, s41, 16 +; SI-NEXT: s_lshr_b32 s92, s41, 8 +; SI-NEXT: s_lshr_b32 s93, s43, 24 +; SI-NEXT: s_lshr_b32 s94, s43, 16 +; SI-NEXT: s_lshr_b32 s95, s43, 8 +; SI-NEXT: s_lshr_b32 s30, s45, 24 +; SI-NEXT: s_lshr_b32 s31, s45, 16 +; SI-NEXT: s_lshr_b32 s34, s45, 8 +; SI-NEXT: s_lshr_b32 s35, s47, 24 +; SI-NEXT: s_lshr_b32 s36, s47, 16 +; SI-NEXT: s_lshr_b32 s37, s47, 8 +; SI-NEXT: s_lshr_b32 s8, s57, 24 +; SI-NEXT: s_lshr_b32 vcc_lo, s57, 16 +; SI-NEXT: s_lshr_b32 s6, s57, 8 +; SI-NEXT: v_mov_b32_e32 v34, v4 +; SI-NEXT: v_mov_b32_e32 v53, v51 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v31, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v20, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v43, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v46, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v16, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v8, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v15, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v51, v10 ; SI-NEXT: s_branch .LBB73_5 ; SI-NEXT: .LBB73_3: -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s48, 0 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s49, 1 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v62, s48, 0 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v62, s49, 1 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 ; SI-NEXT: ; implicit-def: $sgpr38 ; SI-NEXT: ; implicit-def: $sgpr36 @@ -117416,7 +117807,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr62 ; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr98 ; SI-NEXT: ; implicit-def: $sgpr96 ; SI-NEXT: ; implicit-def: $sgpr86 @@ -117430,580 +117821,351 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; implicit-def: $sgpr52 ; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s48, 2 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s49, 3 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v62, s48, 2 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v62, s49, 3 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s48, 4 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s49, 5 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v62, s48, 4 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v62, s49, 5 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s48, 6 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s49, 7 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v62, s48, 6 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v62, s49, 7 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s48, 8 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s49, 9 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v62, s48, 8 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v62, s49, 9 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s48, 10 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s49, 11 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v62, s48, 10 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v62, s49, 11 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s48, 12 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s49, 13 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v62, s48, 12 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v62, s49, 13 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s48, 14 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s49, 15 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v62, s48, 14 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v62, s49, 15 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s48, 16 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s49, 17 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v62, s48, 16 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v62, s49, 17 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s48, 18 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s49, 19 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v62, s48, 18 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v62, s49, 19 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s48, 20 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s49, 21 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v62, s48, 20 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v62, s49, 21 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s48, 22 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s49, 23 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v62, s48, 22 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v62, s49, 23 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s48, 24 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s49, 25 -; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v62, s48, 24 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v62, s49, 25 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s48, 26 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s49, 27 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v62, s48, 26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v62, s49, 27 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s48, 28 -; SI-NEXT: v_writelane_b32 v61, s49, 29 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v62, s48, 28 +; SI-NEXT: v_writelane_b32 v62, s49, 29 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s48, 30 -; SI-NEXT: v_writelane_b32 v61, s49, 31 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v62, s48, 30 +; SI-NEXT: v_writelane_b32 v62, s49, 31 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: v_writelane_b32 v61, s48, 32 -; SI-NEXT: v_writelane_b32 v61, s49, 33 -; SI-NEXT: ; kill: killed $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v62, s48, 32 +; SI-NEXT: v_writelane_b32 v62, s49, 33 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 ; SI-NEXT: s_branch .LBB73_2 ; SI-NEXT: .LBB73_4: -; SI-NEXT: v_mov_b32_e32 v17, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 34 -; SI-NEXT: v_mov_b32_e32 v37, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 35 -; SI-NEXT: v_mov_b32_e32 v51, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 36 -; SI-NEXT: v_mov_b32_e32 v43, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 37 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v57, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 38 -; SI-NEXT: v_mov_b32_e32 v33, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 39 -; SI-NEXT: v_mov_b32_e32 v30, s4 -; SI-NEXT: v_mov_b32_e32 v29, s46 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s98 -; SI-NEXT: v_readlane_b32 s4, v61, 40 -; SI-NEXT: v_mov_b32_e32 v34, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 41 -; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 42 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 43 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 44 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 45 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 46 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 47 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 48 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 49 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 50 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 51 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 52 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 53 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 54 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 55 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 56 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 57 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 58 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 59 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 60 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 61 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 62 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 63 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s96 -; SI-NEXT: v_readlane_b32 s4, v62, 0 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 1 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 2 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 3 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 4 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 5 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 6 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 7 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 8 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 9 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 10 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 11 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 12 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 13 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 14 -; SI-NEXT: v_mov_b32_e32 v60, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 15 -; SI-NEXT: v_mov_b32_e32 v31, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 16 -; SI-NEXT: v_mov_b32_e32 v32, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 17 -; SI-NEXT: v_mov_b32_e32 v18, s5 -; SI-NEXT: v_mov_b32_e32 v46, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 0 -; SI-NEXT: v_readlane_b32 s5, v61, 1 -; SI-NEXT: v_mov_b32_e32 v59, s17 -; SI-NEXT: v_mov_b32_e32 v58, s16 -; SI-NEXT: v_mov_b32_e32 v45, s19 -; SI-NEXT: v_mov_b32_e32 v44, s18 -; SI-NEXT: v_mov_b32_e32 v53, s21 -; SI-NEXT: v_mov_b32_e32 v52, s20 -; SI-NEXT: v_mov_b32_e32 v39, s23 -; SI-NEXT: v_mov_b32_e32 v38, s22 -; SI-NEXT: v_mov_b32_e32 v24, s25 -; SI-NEXT: v_mov_b32_e32 v23, s24 -; SI-NEXT: v_mov_b32_e32 v22, s27 -; SI-NEXT: v_mov_b32_e32 v21, s26 -; SI-NEXT: v_mov_b32_e32 v20, s29 -; SI-NEXT: v_mov_b32_e32 v19, s28 -; SI-NEXT: v_mov_b32_e32 v16, s7 -; SI-NEXT: v_mov_b32_e32 v15, s6 -; SI-NEXT: v_mov_b32_e32 v14, s9 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s86 -; SI-NEXT: v_mov_b32_e32 v13, s8 -; SI-NEXT: v_mov_b32_e32 v12, s11 -; SI-NEXT: v_mov_b32_e32 v11, s10 -; SI-NEXT: v_mov_b32_e32 v10, s13 -; SI-NEXT: v_mov_b32_e32 v9, s12 -; SI-NEXT: v_mov_b32_e32 v8, s15 -; SI-NEXT: v_mov_b32_e32 v7, s14 -; SI-NEXT: v_mov_b32_e32 v6, s41 -; SI-NEXT: v_mov_b32_e32 v5, s40 -; SI-NEXT: v_mov_b32_e32 v4, s43 -; SI-NEXT: v_mov_b32_e32 v3, s42 -; SI-NEXT: v_mov_b32_e32 v2, s45 -; SI-NEXT: v_mov_b32_e32 v1, s44 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v28, s38 -; SI-NEXT: v_mov_b32_e32 v27, s36 -; SI-NEXT: v_mov_b32_e32 v26, s34 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, s30 -; SI-NEXT: v_mov_b32_e32 v56, s94 -; SI-NEXT: v_mov_b32_e32 v55, s92 -; SI-NEXT: v_mov_b32_e32 v54, s90 -; SI-NEXT: v_mov_b32_e32 v42, s88 -; SI-NEXT: v_mov_b32_e32 v41, s78 -; SI-NEXT: v_mov_b32_e32 v40, s76 -; SI-NEXT: v_mov_b32_e32 v50, s74 -; SI-NEXT: v_mov_b32_e32 v49, s72 -; SI-NEXT: v_mov_b32_e32 v48, s62 -; SI-NEXT: v_mov_b32_e32 v47, s60 -; SI-NEXT: v_mov_b32_e32 v36, s58 -; SI-NEXT: v_mov_b32_e32 v35, s56 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s84 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s82 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s80 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s70 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s68 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s66 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s64 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s54 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s52 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s50 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 2 -; SI-NEXT: v_readlane_b32 s5, v61, 3 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 4 -; SI-NEXT: v_readlane_b32 s5, v61, 5 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 6 -; SI-NEXT: v_readlane_b32 s5, v61, 7 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 8 -; SI-NEXT: v_readlane_b32 s5, v61, 9 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 10 -; SI-NEXT: v_readlane_b32 s5, v61, 11 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, s38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 12 -; SI-NEXT: v_readlane_b32 s5, v61, 13 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, s36 +; SI-NEXT: v_mov_b32_e32 v13, s26 +; SI-NEXT: v_readlane_b32 s26, v62, 0 +; SI-NEXT: v_readlane_b32 s27, v62, 1 +; SI-NEXT: v_mov_b32_e32 v17, s26 +; SI-NEXT: v_readlane_b32 s26, v62, 2 +; SI-NEXT: v_readlane_b32 s27, v62, 3 +; SI-NEXT: v_mov_b32_e32 v34, s26 +; SI-NEXT: v_readlane_b32 s26, v62, 4 +; SI-NEXT: v_readlane_b32 s27, v62, 5 +; SI-NEXT: v_mov_b32_e32 v45, s26 +; SI-NEXT: v_readlane_b32 s26, v62, 6 +; SI-NEXT: v_readlane_b32 s27, v62, 7 +; SI-NEXT: v_mov_b32_e32 v59, s26 +; SI-NEXT: v_readlane_b32 s26, v62, 8 +; SI-NEXT: v_readlane_b32 s27, v62, 9 +; SI-NEXT: v_mov_b32_e32 v40, s26 +; SI-NEXT: v_readlane_b32 s26, v62, 10 +; SI-NEXT: v_readlane_b32 s27, v62, 11 +; SI-NEXT: v_mov_b32_e32 v58, s26 +; SI-NEXT: v_readlane_b32 s26, v62, 12 +; SI-NEXT: v_readlane_b32 s27, v62, 13 +; SI-NEXT: v_mov_b32_e32 v9, s26 +; SI-NEXT: v_readlane_b32 s26, v62, 14 +; SI-NEXT: v_readlane_b32 s27, v62, 15 +; SI-NEXT: v_mov_b32_e32 v53, s26 +; SI-NEXT: v_readlane_b32 s26, v62, 16 +; SI-NEXT: v_readlane_b32 s27, v62, 17 +; SI-NEXT: v_mov_b32_e32 v50, s26 +; SI-NEXT: v_readlane_b32 s26, v62, 18 +; SI-NEXT: v_readlane_b32 s27, v62, 19 +; SI-NEXT: v_mov_b32_e32 v29, s26 +; SI-NEXT: v_readlane_b32 s26, v62, 20 +; SI-NEXT: v_readlane_b32 s27, v62, 21 +; SI-NEXT: v_mov_b32_e32 v31, s26 +; SI-NEXT: v_readlane_b32 s26, v62, 22 +; SI-NEXT: v_readlane_b32 s27, v62, 23 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_readlane_b32 s26, v62, 24 +; SI-NEXT: v_readlane_b32 s27, v62, 25 +; SI-NEXT: v_mov_b32_e32 v43, s26 +; SI-NEXT: v_readlane_b32 s26, v62, 26 +; SI-NEXT: v_readlane_b32 s27, v62, 27 +; SI-NEXT: v_mov_b32_e32 v46, s26 +; SI-NEXT: v_readlane_b32 s26, v62, 28 +; SI-NEXT: v_readlane_b32 s27, v62, 29 +; SI-NEXT: v_mov_b32_e32 v16, s26 +; SI-NEXT: v_readlane_b32 s26, v62, 30 +; SI-NEXT: v_readlane_b32 s27, v62, 31 +; SI-NEXT: v_mov_b32_e32 v51, s26 +; SI-NEXT: v_readlane_b32 s26, v62, 32 +; SI-NEXT: v_mov_b32_e32 v3, s8 +; SI-NEXT: v_readlane_b32 s27, v62, 33 +; SI-NEXT: v_mov_b32_e32 v38, s72 +; SI-NEXT: v_mov_b32_e32 v49, s62 +; SI-NEXT: v_mov_b32_e32 v22, s28 +; SI-NEXT: v_mov_b32_e32 v24, s56 +; SI-NEXT: v_mov_b32_e32 v19, s46 +; SI-NEXT: v_mov_b32_e32 v56, s44 +; SI-NEXT: v_mov_b32_e32 v42, s42 +; SI-NEXT: v_mov_b32_e32 v54, s40 +; SI-NEXT: v_mov_b32_e32 v7, s24 +; SI-NEXT: v_mov_b32_e32 v30, s22 +; SI-NEXT: v_mov_b32_e32 v21, s20 +; SI-NEXT: v_mov_b32_e32 v48, s18 +; SI-NEXT: v_mov_b32_e32 v36, s16 +; SI-NEXT: v_mov_b32_e32 v32, s14 +; SI-NEXT: v_mov_b32_e32 v44, s12 +; SI-NEXT: v_mov_b32_e32 v47, s10 +; SI-NEXT: v_mov_b32_e32 v35, s6 +; SI-NEXT: v_mov_b32_e32 v37, s4 +; SI-NEXT: v_mov_b32_e32 v52, s96 +; SI-NEXT: v_mov_b32_e32 v28, s86 +; SI-NEXT: v_mov_b32_e32 v14, s84 +; SI-NEXT: v_mov_b32_e32 v23, s82 +; SI-NEXT: v_mov_b32_e32 v27, s80 +; SI-NEXT: v_mov_b32_e32 v25, s70 +; SI-NEXT: v_mov_b32_e32 v26, s68 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 14 -; SI-NEXT: v_readlane_b32 s5, v61, 15 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v5, s66 +; SI-NEXT: v_mov_b32_e32 v12, s54 +; SI-NEXT: v_mov_b32_e32 v33, s50 +; SI-NEXT: v_mov_b32_e32 v8, s48 +; SI-NEXT: v_mov_b32_e32 v15, s26 +; SI-NEXT: v_readlane_b32 s10, v62, 34 +; SI-NEXT: v_readlane_b32 s12, v62, 35 +; SI-NEXT: v_readlane_b32 s14, v62, 36 +; SI-NEXT: v_readlane_b32 s16, v62, 37 +; SI-NEXT: v_readlane_b32 s18, v62, 38 +; SI-NEXT: v_readlane_b32 s20, v62, 39 +; SI-NEXT: v_readlane_b32 s22, v62, 40 +; SI-NEXT: v_readlane_b32 s24, v62, 41 +; SI-NEXT: v_readlane_b32 s26, v62, 42 +; SI-NEXT: v_readlane_b32 s27, v62, 43 +; SI-NEXT: v_readlane_b32 s28, v62, 44 +; SI-NEXT: v_readlane_b32 s29, v62, 45 +; SI-NEXT: v_readlane_b32 s40, v62, 46 +; SI-NEXT: v_readlane_b32 s42, v62, 47 +; SI-NEXT: v_readlane_b32 s44, v62, 48 +; SI-NEXT: v_readlane_b32 s46, v62, 49 +; SI-NEXT: v_readlane_b32 s56, v62, 50 +; SI-NEXT: v_readlane_b32 s59, v62, 52 +; SI-NEXT: v_readlane_b32 s61, v62, 54 +; SI-NEXT: v_readlane_b32 s62, v62, 55 +; SI-NEXT: v_readlane_b32 s63, v62, 56 +; SI-NEXT: v_readlane_b32 s72, v62, 57 +; SI-NEXT: v_readlane_b32 s73, v62, 58 +; SI-NEXT: v_readlane_b32 s75, v62, 60 +; SI-NEXT: v_readlane_b32 s77, v62, 62 +; SI-NEXT: v_readlane_b32 s79, v61, 0 +; SI-NEXT: v_readlane_b32 s89, v61, 2 +; SI-NEXT: v_readlane_b32 s91, v61, 4 +; SI-NEXT: v_readlane_b32 s93, v61, 6 +; SI-NEXT: v_readlane_b32 s95, v61, 8 +; SI-NEXT: v_readlane_b32 s31, v61, 10 +; SI-NEXT: v_readlane_b32 s34, v61, 11 +; SI-NEXT: v_readlane_b32 s35, v61, 12 +; SI-NEXT: v_readlane_b32 s36, v61, 13 +; SI-NEXT: v_readlane_b32 s37, v61, 14 +; SI-NEXT: v_readlane_b32 s8, v61, 15 +; SI-NEXT: v_readlane_b32 vcc_lo, v61, 16 +; SI-NEXT: v_readlane_b32 s6, v61, 17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 16 -; SI-NEXT: v_readlane_b32 s5, v61, 17 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, s30 +; SI-NEXT: v_readlane_b32 s30, v61, 9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 18 -; SI-NEXT: v_readlane_b32 s5, v61, 19 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, s94 +; SI-NEXT: v_readlane_b32 s94, v61, 7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 20 -; SI-NEXT: v_readlane_b32 s5, v61, 21 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, s92 +; SI-NEXT: v_readlane_b32 s92, v61, 5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 22 -; SI-NEXT: v_readlane_b32 s5, v61, 23 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, s90 +; SI-NEXT: v_readlane_b32 s90, v61, 3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 24 -; SI-NEXT: v_readlane_b32 s5, v61, 25 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, s88 +; SI-NEXT: v_readlane_b32 s88, v61, 1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 26 -; SI-NEXT: v_readlane_b32 s5, v61, 27 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, s78 +; SI-NEXT: v_readlane_b32 s78, v62, 63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 28 -; SI-NEXT: v_readlane_b32 s5, v61, 29 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, s76 +; SI-NEXT: v_readlane_b32 s76, v62, 61 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 30 -; SI-NEXT: v_readlane_b32 s5, v61, 31 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, s74 +; SI-NEXT: v_readlane_b32 s74, v62, 59 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 32 -; SI-NEXT: v_readlane_b32 s5, v61, 33 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, s60 +; SI-NEXT: v_readlane_b32 s60, v62, 53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s48 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, s58 +; SI-NEXT: v_readlane_b32 s58, v62, 51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, s98 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s52 +; SI-NEXT: v_mov_b32_e32 v1, s64 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: .LBB73_5: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v28, 8, v28 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_and_b32_e32 v29, 0xff, v58 -; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_or_b32_e32 v26, v26, v27 -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v28 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v26, 0xff, v59 -; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v46 -; SI-NEXT: v_or_b32_e32 v26, v26, v27 -; SI-NEXT: v_and_b32_e32 v27, 0xff, v32 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v31 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; SI-NEXT: v_or_b32_e32 v26, v26, v27 -; SI-NEXT: v_add_i32_e32 v27, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v26, 0xff, v44 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: v_and_b32_e32 v26, 0xff, v56 -; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v55 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: v_add_i32_e32 v26, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v25, 0xff, v45 -; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v60 -; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 -; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 -; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 -; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 -; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v24 +; SI-NEXT: s_and_b32 s4, s57, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 8 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s6, vcc_lo, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s8, s8, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s6, s8, s6 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_lshl_b32 s6, s37, 8 +; SI-NEXT: s_lshl_b32 s8, s35, 24 ; SI-NEXT: v_readlane_b32 s99, v63, 35 ; SI-NEXT: v_readlane_b32 s98, v63, 34 ; SI-NEXT: v_readlane_b32 s97, v63, 33 @@ -118035,509 +118197,436 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: v_readlane_b32 s39, v63, 7 ; SI-NEXT: v_readlane_b32 s38, v63, 6 ; SI-NEXT: v_readlane_b32 s37, v63, 5 -; SI-NEXT: v_readlane_b32 s36, v63, 4 ; SI-NEXT: v_readlane_b32 s35, v63, 3 -; SI-NEXT: v_readlane_b32 s34, v63, 2 -; SI-NEXT: v_readlane_b32 s31, v63, 1 -; SI-NEXT: v_readlane_b32 s30, v63, 0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v27 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: v_add_i32_e32 v26, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v54 -; SI-NEXT: v_and_b32_e32 v26, 0xff, v52 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: v_and_b32_e32 v26, 0xff, v42 -; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v41 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: v_add_i32_e32 v26, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v25, 0xff, v53 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v26 -; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: v_add_i32_e32 v26, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v40 -; SI-NEXT: v_and_b32_e32 v26, 0xff, v38 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: v_and_b32_e32 v26, 0xff, v50 -; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v49 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: v_add_i32_e32 v26, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v10 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v25, 0xff, v39 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v19 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_and_b32 s4, s47, 0xff +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s6, s36, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s6, s8, s6 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_lshl_b32 s6, s34, 8 +; SI-NEXT: s_lshl_b32 s8, s30, 24 +; SI-NEXT: v_readlane_b32 s36, v63, 4 +; SI-NEXT: v_readlane_b32 s34, v63, 2 +; SI-NEXT: v_readlane_b32 s30, v63, 0 +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v26 -; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: v_add_i32_e32 v26, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v48 -; SI-NEXT: v_or_b32_e32 v23, v23, v25 -; SI-NEXT: v_and_b32_e32 v25, 0xff, v47 -; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v36 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v23, v23, v25 -; SI-NEXT: v_add_i32_e32 v25, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v18 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xff, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v56 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_and_b32 s4, s45, 0xff +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s6, s31, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s6, s8, s6 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_lshl_b32 s6, s95, 8 +; SI-NEXT: s_lshl_b32 s8, s93, 24 +; SI-NEXT: v_readlane_b32 s31, v63, 1 +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v55 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v24 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v55 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_add_i32_e32 v24, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v55 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v35 -; SI-NEXT: v_or_b32_e32 v21, v21, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v42 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_and_b32 s4, s43, 0xff +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s6, s94, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v38 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s6, s8, s6 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_lshl_b32 s6, s92, 8 +; SI-NEXT: s_lshl_b32 s8, s90, 24 +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v41 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: v_or_b32_e32 v21, v21, v23 -; SI-NEXT: v_add_i32_e32 v23, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v4, 0xff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v21, 0xff, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 -; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: v_add_i32_e32 v22, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v54 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v49 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: s_and_b32 s4, s41, 0xff +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s6, s91, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s6, s8, s6 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_lshl_b32 s6, s89, 8 +; SI-NEXT: s_lshl_b32 s8, s79, 24 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 -; SI-NEXT: v_or_b32_e32 v19, v19, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v38 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v22 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: v_or_b32_e32 v19, v19, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v38 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xff, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v7 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 -; SI-NEXT: v_or_b32_e32 v17, v17, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v22 +; SI-NEXT: s_and_b32 s4, s25, 0xff +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v13 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s6, s88, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s6, s8, s6 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_lshl_b32 s6, s78, 8 +; SI-NEXT: s_lshl_b32 s8, s76, 24 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v20 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_or_b32_e32 v17, v17, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v17, 0xff, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_add_i32_e32 v18, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; SI-NEXT: v_or_b32_e32 v15, v15, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v18 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: v_or_b32_e32 v15, v15, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v30 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xff, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_add_i32_e32 v16, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v52 +; SI-NEXT: s_and_b32 s4, s23, 0xff +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v28 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s6, s77, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v14 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s6, s8, s6 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; SI-NEXT: v_or_b32_e32 v13, v13, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: v_or_b32_e32 v13, v13, v15 -; SI-NEXT: v_add_i32_e32 v15, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v21 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xff, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v15 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v23 +; SI-NEXT: s_and_b32 s4, s21, 0xff +; SI-NEXT: s_lshl_b32 s6, s75, 8 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v27 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s6, s74, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s8, s73, 24 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s6, s8, s6 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; SI-NEXT: v_or_b32_e32 v11, v11, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_or_b32_e32 v11, v11, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v48 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v12 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v26 +; SI-NEXT: s_and_b32 s4, s19, 0xff +; SI-NEXT: s_lshl_b32 s6, s72, 8 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v5 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s6, s63, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s8, s62, 24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s6, s8, s6 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 64, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; SI-NEXT: v_or_b32_e32 v9, v9, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_or_b32_e32 v9, v9, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v36 +; SI-NEXT: s_and_b32 s4, s17, 0xff +; SI-NEXT: s_lshl_b32 s6, s61, 8 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s6, s60, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v33 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s8, s59, 24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s6, s8, s6 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_lshl_b32 s6, s58, 8 +; SI-NEXT: s_lshl_b32 s8, s46, 24 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v4, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v7, v7, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_or_b32_e32 v7, v7, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v32 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v17 +; SI-NEXT: s_and_b32 s4, s15, 0xff +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v34 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s6, s56, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v45 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s6, s8, s6 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: v_or_b32_e32 v5, v5, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_or_b32_e32 v5, v5, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v44 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v34 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v59 +; SI-NEXT: s_and_b32 s4, s13, 0xff +; SI-NEXT: s_lshl_b32 s6, s44, 8 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v40 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s6, s42, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v58 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s8, s40, 24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s6, s8, s6 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; SI-NEXT: v_or_b32_e32 v3, v3, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_or_b32_e32 v3, v3, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v47 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v30 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v9 +; SI-NEXT: s_and_b32 s4, s11, 0xff +; SI-NEXT: s_lshl_b32 s6, s29, 8 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v53 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s6, s28, 0xff ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v57 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v50 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s6, s8, s6 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x60, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v29 +; SI-NEXT: s_and_b32 s4, s9, 0xff +; SI-NEXT: s_lshl_b32 s6, s26, 8 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v31 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s8, s22, 24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s6, s8, s6 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v43 +; SI-NEXT: s_and_b32 s4, s7, 0xff +; SI-NEXT: s_lshl_b32 s6, s20, 8 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v46 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xff ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v16 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s16, 24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v43 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v51 +; SI-NEXT: s_and_b32 s4, s5, 0xff +; SI-NEXT: s_lshl_b32 s5, s14, 8 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v51 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s12, 0xff ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v37 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v15 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s10, 24 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -118553,9 +118642,9 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -118564,8 +118653,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: v_writelane_b32 v63, s30, 0 ; VI-NEXT: v_writelane_b32 v63, s31, 1 @@ -118585,40 +118674,68 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_writelane_b32 v63, s55, 15 ; VI-NEXT: v_writelane_b32 v63, s64, 16 ; VI-NEXT: v_writelane_b32 v63, s65, 17 +; VI-NEXT: v_mov_b32_e32 v20, s16 ; VI-NEXT: v_writelane_b32 v63, s66, 18 +; VI-NEXT: v_readfirstlane_b32 s56, v20 +; VI-NEXT: v_mov_b32_e32 v20, s17 ; VI-NEXT: v_writelane_b32 v63, s67, 19 +; VI-NEXT: v_readfirstlane_b32 s57, v20 +; VI-NEXT: v_mov_b32_e32 v20, s18 ; VI-NEXT: v_writelane_b32 v63, s68, 20 +; VI-NEXT: v_readfirstlane_b32 s46, v20 +; VI-NEXT: v_mov_b32_e32 v20, s19 ; VI-NEXT: v_writelane_b32 v63, s69, 21 +; VI-NEXT: v_readfirstlane_b32 s47, v20 +; VI-NEXT: v_mov_b32_e32 v20, s20 ; VI-NEXT: v_writelane_b32 v63, s70, 22 +; VI-NEXT: v_readfirstlane_b32 s44, v20 +; VI-NEXT: v_mov_b32_e32 v20, s21 ; VI-NEXT: v_writelane_b32 v63, s71, 23 +; VI-NEXT: v_readfirstlane_b32 s45, v20 +; VI-NEXT: v_mov_b32_e32 v20, s22 ; VI-NEXT: v_writelane_b32 v63, s80, 24 +; VI-NEXT: v_readfirstlane_b32 s42, v20 +; VI-NEXT: v_mov_b32_e32 v20, s23 ; VI-NEXT: v_writelane_b32 v63, s81, 25 +; VI-NEXT: v_readfirstlane_b32 s43, v20 +; VI-NEXT: v_mov_b32_e32 v20, s24 ; VI-NEXT: v_writelane_b32 v63, s82, 26 +; VI-NEXT: v_readfirstlane_b32 s40, v20 +; VI-NEXT: v_mov_b32_e32 v20, s25 ; VI-NEXT: v_writelane_b32 v63, s83, 27 +; VI-NEXT: v_readfirstlane_b32 s41, v20 +; VI-NEXT: v_mov_b32_e32 v20, s26 ; VI-NEXT: v_writelane_b32 v63, s84, 28 +; VI-NEXT: v_readfirstlane_b32 s24, v20 +; VI-NEXT: v_mov_b32_e32 v20, s27 ; VI-NEXT: v_writelane_b32 v63, s85, 29 +; VI-NEXT: v_readfirstlane_b32 s25, v20 +; VI-NEXT: v_mov_b32_e32 v20, s28 ; VI-NEXT: v_writelane_b32 v63, s86, 30 +; VI-NEXT: v_readfirstlane_b32 s20, v20 +; VI-NEXT: v_mov_b32_e32 v20, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 ; VI-NEXT: v_writelane_b32 v63, s87, 31 -; VI-NEXT: v_readfirstlane_b32 s6, v1 -; VI-NEXT: v_readfirstlane_b32 s7, v2 -; VI-NEXT: v_readfirstlane_b32 s8, v3 -; VI-NEXT: v_readfirstlane_b32 s9, v4 -; VI-NEXT: v_readfirstlane_b32 s10, v5 -; VI-NEXT: v_readfirstlane_b32 s11, v6 -; VI-NEXT: v_readfirstlane_b32 s12, v7 -; VI-NEXT: v_readfirstlane_b32 s13, v8 -; VI-NEXT: v_readfirstlane_b32 s14, v9 -; VI-NEXT: v_readfirstlane_b32 s15, v10 -; VI-NEXT: v_readfirstlane_b32 s40, v11 -; VI-NEXT: v_readfirstlane_b32 s41, v12 -; VI-NEXT: v_readfirstlane_b32 s42, v13 -; VI-NEXT: v_readfirstlane_b32 s43, v14 -; VI-NEXT: v_readfirstlane_b32 s44, v15 -; VI-NEXT: v_readfirstlane_b32 s45, v16 -; VI-NEXT: v_readfirstlane_b32 s4, v17 -; VI-NEXT: s_and_b64 s[46:47], vcc, exec -; VI-NEXT: v_readfirstlane_b32 s5, v18 +; VI-NEXT: v_readfirstlane_b32 s21, v20 +; VI-NEXT: v_readfirstlane_b32 s22, v1 +; VI-NEXT: v_readfirstlane_b32 s23, v2 +; VI-NEXT: v_readfirstlane_b32 s18, v3 +; VI-NEXT: v_readfirstlane_b32 s19, v4 +; VI-NEXT: v_readfirstlane_b32 s16, v5 +; VI-NEXT: v_readfirstlane_b32 s17, v6 +; VI-NEXT: v_readfirstlane_b32 s14, v7 +; VI-NEXT: v_readfirstlane_b32 s15, v8 +; VI-NEXT: v_readfirstlane_b32 s12, v9 +; VI-NEXT: v_readfirstlane_b32 s13, v10 +; VI-NEXT: v_readfirstlane_b32 s10, v11 +; VI-NEXT: v_readfirstlane_b32 s11, v12 +; VI-NEXT: v_readfirstlane_b32 s8, v13 +; VI-NEXT: v_readfirstlane_b32 s9, v14 +; VI-NEXT: v_readfirstlane_b32 s4, v15 +; VI-NEXT: v_readfirstlane_b32 s5, v16 +; VI-NEXT: v_readfirstlane_b32 s6, v17 +; VI-NEXT: s_and_b64 s[26:27], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v18 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill @@ -118636,387 +118753,326 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane ; VI-NEXT: s_cbranch_scc0 .LBB73_3 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s46, s5, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 10 -; VI-NEXT: s_lshr_b32 s46, s5, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 11 -; VI-NEXT: s_lshr_b32 s46, s5, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 12 -; VI-NEXT: s_lshr_b32 s46, s4, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 8 -; VI-NEXT: s_lshr_b32 s46, s4, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 9 -; VI-NEXT: s_lshr_b32 s46, s45, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 13 -; VI-NEXT: s_lshr_b32 s46, s45, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 14 -; VI-NEXT: s_lshr_b32 s46, s45, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 15 -; VI-NEXT: s_lshr_b32 s46, s44, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 6 -; VI-NEXT: s_lshr_b32 s46, s44, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 7 -; VI-NEXT: s_lshr_b32 s46, s43, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 16 -; VI-NEXT: s_lshr_b32 s46, s43, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 17 -; VI-NEXT: s_lshr_b32 s46, s43, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 18 -; VI-NEXT: s_lshr_b32 s46, s42, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 4 -; VI-NEXT: s_lshr_b32 s46, s42, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 5 -; VI-NEXT: s_lshr_b32 s46, s41, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 19 -; VI-NEXT: s_lshr_b32 s46, s41, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 20 -; VI-NEXT: s_lshr_b32 s46, s41, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 21 -; VI-NEXT: s_lshr_b32 s46, s40, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 2 -; VI-NEXT: s_lshr_b32 s46, s40, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 3 -; VI-NEXT: s_lshr_b32 s46, s15, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 22 -; VI-NEXT: s_lshr_b32 s46, s15, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 23 -; VI-NEXT: s_lshr_b32 s46, s15, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 24 -; VI-NEXT: s_lshr_b32 s46, s14, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 0 -; VI-NEXT: s_lshr_b32 s46, s14, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 1 -; VI-NEXT: s_lshr_b32 s46, s13, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 25 -; VI-NEXT: s_lshr_b32 s46, s13, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 26 -; VI-NEXT: s_lshr_b32 s46, s13, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 27 -; VI-NEXT: s_lshr_b32 s46, s11, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 28 -; VI-NEXT: s_lshr_b32 s46, s11, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 29 -; VI-NEXT: s_lshr_b32 s46, s11, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 30 -; VI-NEXT: s_lshr_b32 s46, s9, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 31 -; VI-NEXT: s_lshr_b32 s46, s9, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 32 -; VI-NEXT: s_lshr_b32 s46, s9, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 33 -; VI-NEXT: s_lshr_b32 s46, s7, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 34 -; VI-NEXT: s_lshr_b32 s46, s7, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 35 -; VI-NEXT: s_lshr_b32 s46, s7, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 36 -; VI-NEXT: s_lshr_b32 s46, s29, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 37 -; VI-NEXT: s_lshr_b32 s46, s29, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 38 -; VI-NEXT: s_lshr_b32 s46, s29, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 39 -; VI-NEXT: s_lshr_b32 s46, s27, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 40 -; VI-NEXT: s_lshr_b32 s46, s27, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 41 -; VI-NEXT: s_lshr_b32 s46, s27, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 42 -; VI-NEXT: s_lshr_b32 s46, s25, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 43 -; VI-NEXT: s_lshr_b32 s46, s25, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 44 -; VI-NEXT: s_lshr_b32 s46, s25, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 45 -; VI-NEXT: s_lshr_b32 s46, s23, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 46 -; VI-NEXT: s_lshr_b32 s46, s23, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 47 -; VI-NEXT: s_lshr_b32 s46, s23, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 48 -; VI-NEXT: s_lshr_b32 s46, s21, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 49 -; VI-NEXT: s_lshr_b32 s46, s21, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 50 -; VI-NEXT: s_lshr_b32 s46, s21, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 51 -; VI-NEXT: s_lshr_b32 s46, s19, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 52 -; VI-NEXT: s_lshr_b32 s46, s19, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 53 -; VI-NEXT: s_lshr_b32 s46, s19, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 54 -; VI-NEXT: s_lshr_b32 s46, s17, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 55 -; VI-NEXT: s_lshr_b32 s46, s17, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 56 -; VI-NEXT: s_lshr_b32 s46, s17, 8 -; VI-NEXT: s_lshr_b32 s81, s12, 16 -; VI-NEXT: s_lshr_b32 s80, s12, 8 -; VI-NEXT: s_lshr_b32 s83, s10, 16 -; VI-NEXT: s_lshr_b32 s82, s10, 8 -; VI-NEXT: s_lshr_b32 s85, s8, 16 -; VI-NEXT: s_lshr_b32 s84, s8, 8 -; VI-NEXT: s_lshr_b32 s51, s6, 16 -; VI-NEXT: s_lshr_b32 s50, s6, 8 -; VI-NEXT: s_lshr_b32 s52, s28, 16 -; VI-NEXT: s_lshr_b32 s86, s28, 8 -; VI-NEXT: s_lshr_b32 s87, s26, 16 -; VI-NEXT: s_lshr_b32 s53, s26, 8 -; VI-NEXT: s_lshr_b32 s55, s24, 16 -; VI-NEXT: s_lshr_b32 s54, s24, 8 +; VI-NEXT: s_lshr_b32 s26, s7, 24 +; VI-NEXT: v_writelane_b32 v62, s26, 13 +; VI-NEXT: s_lshr_b32 s26, s7, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 14 +; VI-NEXT: s_lshr_b32 s26, s7, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 15 +; VI-NEXT: s_lshr_b32 s26, s6, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 11 +; VI-NEXT: s_lshr_b32 s26, s6, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 12 +; VI-NEXT: s_lshr_b32 s26, s5, 24 +; VI-NEXT: v_writelane_b32 v62, s26, 16 +; VI-NEXT: s_lshr_b32 s26, s5, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 17 +; VI-NEXT: s_lshr_b32 s26, s5, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 18 +; VI-NEXT: s_lshr_b32 s26, s4, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 9 +; VI-NEXT: s_lshr_b32 s26, s4, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 10 +; VI-NEXT: s_lshr_b32 s26, s9, 24 +; VI-NEXT: v_writelane_b32 v62, s26, 19 +; VI-NEXT: s_lshr_b32 s26, s9, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 20 +; VI-NEXT: s_lshr_b32 s26, s9, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 21 +; VI-NEXT: s_lshr_b32 s26, s8, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 7 +; VI-NEXT: s_lshr_b32 s26, s8, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 8 +; VI-NEXT: s_lshr_b32 s26, s11, 24 +; VI-NEXT: v_writelane_b32 v62, s26, 22 +; VI-NEXT: s_lshr_b32 s26, s11, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 23 +; VI-NEXT: s_lshr_b32 s26, s11, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 24 +; VI-NEXT: s_lshr_b32 s26, s10, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 5 +; VI-NEXT: s_lshr_b32 s26, s10, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 6 +; VI-NEXT: s_lshr_b32 s26, s13, 24 +; VI-NEXT: v_writelane_b32 v62, s26, 25 +; VI-NEXT: s_lshr_b32 s26, s13, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 26 +; VI-NEXT: s_lshr_b32 s26, s13, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 27 +; VI-NEXT: s_lshr_b32 s26, s12, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 3 +; VI-NEXT: s_lshr_b32 s26, s12, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 4 +; VI-NEXT: s_lshr_b32 s26, s15, 24 +; VI-NEXT: v_writelane_b32 v62, s26, 28 +; VI-NEXT: s_lshr_b32 s26, s15, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 29 +; VI-NEXT: s_lshr_b32 s26, s15, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 30 +; VI-NEXT: s_lshr_b32 s26, s14, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 1 +; VI-NEXT: s_lshr_b32 s26, s14, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 2 +; VI-NEXT: s_lshr_b32 s26, s17, 24 +; VI-NEXT: v_writelane_b32 v62, s26, 31 +; VI-NEXT: s_lshr_b32 s26, s17, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 32 +; VI-NEXT: s_lshr_b32 s26, s17, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 33 +; VI-NEXT: s_lshr_b32 s26, s16, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 0 +; VI-NEXT: s_lshr_b32 s26, s19, 24 +; VI-NEXT: v_writelane_b32 v62, s26, 34 +; VI-NEXT: s_lshr_b32 s26, s19, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 35 +; VI-NEXT: s_lshr_b32 s26, s19, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 36 +; VI-NEXT: s_lshr_b32 s26, s23, 24 +; VI-NEXT: v_writelane_b32 v62, s26, 37 +; VI-NEXT: s_lshr_b32 s26, s23, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 38 +; VI-NEXT: s_lshr_b32 s26, s23, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 39 +; VI-NEXT: s_lshr_b32 s26, s21, 24 +; VI-NEXT: v_writelane_b32 v62, s26, 40 +; VI-NEXT: s_lshr_b32 s26, s21, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 41 +; VI-NEXT: s_lshr_b32 s26, s21, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 42 +; VI-NEXT: s_lshr_b32 s26, s25, 24 +; VI-NEXT: v_writelane_b32 v62, s26, 43 +; VI-NEXT: s_lshr_b32 s26, s25, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 44 +; VI-NEXT: s_lshr_b32 s26, s25, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 45 +; VI-NEXT: s_lshr_b32 s26, s41, 24 +; VI-NEXT: v_writelane_b32 v62, s26, 46 +; VI-NEXT: s_lshr_b32 s26, s41, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 47 +; VI-NEXT: s_lshr_b32 s26, s41, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 48 +; VI-NEXT: s_lshr_b32 s26, s43, 24 +; VI-NEXT: v_writelane_b32 v62, s26, 49 +; VI-NEXT: s_lshr_b32 s26, s43, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 50 +; VI-NEXT: s_lshr_b32 s26, s43, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 51 +; VI-NEXT: s_lshr_b32 s26, s45, 24 +; VI-NEXT: v_writelane_b32 v62, s26, 52 +; VI-NEXT: s_lshr_b32 s26, s45, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 53 +; VI-NEXT: s_lshr_b32 s26, s45, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 54 +; VI-NEXT: s_lshr_b32 s26, s47, 24 +; VI-NEXT: v_writelane_b32 v62, s26, 55 +; VI-NEXT: s_lshr_b32 s26, s47, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 56 +; VI-NEXT: s_lshr_b32 s26, s47, 8 +; VI-NEXT: s_lshr_b32 s82, s16, 8 +; VI-NEXT: s_lshr_b32 s83, s18, 16 +; VI-NEXT: s_lshr_b32 s55, s18, 8 ; VI-NEXT: s_lshr_b32 s65, s22, 16 ; VI-NEXT: s_lshr_b32 s64, s22, 8 -; VI-NEXT: s_lshr_b32 s67, s20, 16 -; VI-NEXT: s_lshr_b32 s66, s20, 8 -; VI-NEXT: s_lshr_b32 s69, s18, 16 -; VI-NEXT: s_lshr_b32 s68, s18, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 57 -; VI-NEXT: s_lshr_b32 s71, s16, 16 -; VI-NEXT: s_lshr_b32 s70, s16, 8 -; VI-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 -; VI-NEXT: s_lshr_b64 s[56:57], s[44:45], 24 -; VI-NEXT: s_lshr_b64 s[58:59], s[42:43], 24 -; VI-NEXT: s_lshr_b64 s[60:61], s[40:41], 24 -; VI-NEXT: s_lshr_b64 s[62:63], s[14:15], 24 -; VI-NEXT: s_lshr_b64 s[72:73], s[12:13], 24 -; VI-NEXT: s_lshr_b64 s[74:75], s[10:11], 24 -; VI-NEXT: s_lshr_b64 s[76:77], s[8:9], 24 -; VI-NEXT: s_lshr_b64 s[78:79], s[6:7], 24 -; VI-NEXT: s_lshr_b64 s[88:89], s[28:29], 24 -; VI-NEXT: s_lshr_b64 s[90:91], s[26:27], 24 -; VI-NEXT: s_lshr_b64 s[30:31], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[34:35], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[36:37], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[38:39], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[48:49], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s85, s20, 16 +; VI-NEXT: s_lshr_b32 s84, s20, 8 +; VI-NEXT: s_lshr_b32 s67, s24, 16 +; VI-NEXT: s_lshr_b32 s66, s24, 8 +; VI-NEXT: s_lshr_b32 s69, s40, 16 +; VI-NEXT: s_lshr_b32 s68, s40, 8 +; VI-NEXT: s_lshr_b32 s71, s42, 16 +; VI-NEXT: s_lshr_b32 s70, s42, 8 +; VI-NEXT: s_lshr_b32 s87, s44, 16 +; VI-NEXT: s_lshr_b32 s86, s44, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 57 +; VI-NEXT: s_lshr_b32 s81, s46, 16 +; VI-NEXT: s_lshr_b32 s80, s46, 8 +; VI-NEXT: s_lshr_b32 s51, s57, 24 +; VI-NEXT: s_lshr_b32 s52, s57, 16 +; VI-NEXT: s_lshr_b32 s53, s57, 8 +; VI-NEXT: s_lshr_b32 s54, s56, 16 +; VI-NEXT: s_lshr_b32 s50, s56, 8 +; VI-NEXT: s_lshr_b64 s[26:27], s[6:7], 24 +; VI-NEXT: s_lshr_b64 s[28:29], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 +; VI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 +; VI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[74:75], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[76:77], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[78:79], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[88:89], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[90:91], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[30:31], s[40:41], 24 +; VI-NEXT: s_lshr_b64 s[34:35], s[42:43], 24 +; VI-NEXT: s_lshr_b64 s[36:37], s[44:45], 24 +; VI-NEXT: s_lshr_b64 s[38:39], s[46:47], 24 +; VI-NEXT: s_lshr_b64 s[48:49], s[56:57], 24 ; VI-NEXT: s_cbranch_execnz .LBB73_4 ; VI-NEXT: .LBB73_2: ; %cmp.true -; VI-NEXT: v_add_f64 v[11:12], s[4:5], 1.0 -; VI-NEXT: v_add_f64 v[1:2], s[44:45], 1.0 -; VI-NEXT: v_add_f64 v[3:4], s[42:43], 1.0 -; VI-NEXT: v_add_f64 v[5:6], s[40:41], 1.0 -; VI-NEXT: v_add_f64 v[7:8], s[14:15], 1.0 -; VI-NEXT: v_add_f64 v[9:10], s[12:13], 1.0 -; VI-NEXT: v_add_f64 v[13:14], s[10:11], 1.0 -; VI-NEXT: v_add_f64 v[15:16], s[8:9], 1.0 -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[11:12] -; VI-NEXT: v_add_f64 v[17:18], s[6:7], 1.0 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[1:2] -; VI-NEXT: v_add_f64 v[19:20], s[28:29], 1.0 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[3:4] -; VI-NEXT: v_add_f64 v[21:22], s[26:27], 1.0 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[5:6] -; VI-NEXT: v_add_f64 v[23:24], s[24:25], 1.0 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[7:8] -; VI-NEXT: v_add_f64 v[25:26], s[22:23], 1.0 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10] -; VI-NEXT: v_add_f64 v[27:28], s[20:21], 1.0 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] -; VI-NEXT: v_add_f64 v[29:30], s[18:19], 1.0 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16] -; VI-NEXT: v_add_f64 v[31:32], s[16:17], 1.0 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18] -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] -; VI-NEXT: v_lshrrev_b32_e32 v44, 24, v2 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22] -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[23:24] -; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v2 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[25:26] -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[27:28] -; VI-NEXT: v_lshrrev_b32_e32 v47, 24, v4 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[29:30] -; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v3 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v4 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v4 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v6 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v6 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v8 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v8 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v8 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v7 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v10 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v10 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v9 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v14 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v14 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v13 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v16 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v16 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v16 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v15 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v18 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v18 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v18 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v17 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v20 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v20 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v20 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: v_add_f64 v[1:2], s[6:7], 1.0 +; VI-NEXT: v_add_f64 v[3:4], s[4:5], 1.0 +; VI-NEXT: v_add_f64 v[23:24], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[25:26], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[5:6], s[8:9], 1.0 +; VI-NEXT: v_add_f64 v[27:28], s[24:25], 1.0 +; VI-NEXT: v_add_f64 v[31:32], s[40:41], 1.0 +; VI-NEXT: v_add_f64 v[33:34], s[42:43], 1.0 +; VI-NEXT: v_lshrrev_b64 v[9:10], 24, v[1:2] +; VI-NEXT: v_add_f64 v[19:20], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[7:8], s[10:11], 1.0 +; VI-NEXT: v_add_f64 v[37:38], s[44:45], 1.0 +; VI-NEXT: v_add_f64 v[17:18], s[16:17], 1.0 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: v_add_f64 v[52:53], s[56:57], 1.0 +; VI-NEXT: v_add_f64 v[50:51], s[46:47], 1.0 +; VI-NEXT: v_add_f64 v[13:14], s[14:15], 1.0 +; VI-NEXT: v_add_f64 v[11:12], s[12:13], 1.0 +; VI-NEXT: v_lshrrev_b64 v[9:10], 24, v[3:4] +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24] +; VI-NEXT: v_lshrrev_b64 v[9:10], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[25:26] +; VI-NEXT: v_lshrrev_b64 v[42:43], 24, v[27:28] +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[31:32] +; VI-NEXT: v_readfirstlane_b32 s11, v8 +; VI-NEXT: v_lshrrev_b64 v[29:30], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[8:9], 24, v[19:20] +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[33:34] +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] +; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[37:38] +; VI-NEXT: v_readfirstlane_b32 s57, v53 +; VI-NEXT: v_readfirstlane_b32 s47, v51 +; VI-NEXT: v_readfirstlane_b32 s45, v38 +; VI-NEXT: v_readfirstlane_b32 s43, v34 +; VI-NEXT: v_readfirstlane_b32 s41, v32 +; VI-NEXT: v_readfirstlane_b32 s25, v28 +; VI-NEXT: v_readfirstlane_b32 s21, v26 +; VI-NEXT: v_readfirstlane_b32 s23, v20 +; VI-NEXT: v_readfirstlane_b32 s19, v24 +; VI-NEXT: v_readfirstlane_b32 s17, v18 +; VI-NEXT: v_readfirstlane_b32 s15, v14 +; VI-NEXT: v_readfirstlane_b32 s13, v12 +; VI-NEXT: v_readfirstlane_b32 s9, v6 +; VI-NEXT: v_readfirstlane_b32 s5, v4 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: v_lshrrev_b64 v[35:36], 24, v[11:12] +; VI-NEXT: v_lshrrev_b64 v[48:49], 24, v[13:14] +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[46:47], 24, v[50:51] +; VI-NEXT: v_lshrrev_b64 v[55:56], 24, v[52:53] +; VI-NEXT: s_lshr_b32 s10, s7, 24 +; VI-NEXT: s_lshr_b32 s12, s7, 16 +; VI-NEXT: s_lshr_b32 s14, s7, 8 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; VI-NEXT: s_lshr_b32 s16, s5, 24 +; VI-NEXT: s_lshr_b32 s18, s5, 16 +; VI-NEXT: s_lshr_b32 s20, s5, 8 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v3 +; VI-NEXT: s_lshr_b32 s22, s9, 24 +; VI-NEXT: s_lshr_b32 s24, s9, 16 +; VI-NEXT: s_lshr_b32 s26, s9, 8 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v12, 8, v5 +; VI-NEXT: s_lshr_b32 s27, s11, 24 +; VI-NEXT: s_lshr_b32 s28, s11, 16 +; VI-NEXT: s_lshr_b32 s29, s11, 8 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v7 +; VI-NEXT: s_lshr_b32 s40, s13, 24 +; VI-NEXT: s_lshr_b32 s42, s13, 16 +; VI-NEXT: s_lshr_b32 s44, s13, 8 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v11 +; VI-NEXT: s_lshr_b32 s46, s15, 24 +; VI-NEXT: s_lshr_b32 s56, s15, 16 +; VI-NEXT: s_lshr_b32 s58, s15, 8 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v13 +; VI-NEXT: s_lshr_b32 s59, s17, 24 +; VI-NEXT: s_lshr_b32 s60, s17, 16 +; VI-NEXT: s_lshr_b32 s61, s17, 8 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v17 +; VI-NEXT: s_lshr_b32 s62, s19, 24 +; VI-NEXT: s_lshr_b32 s63, s19, 16 +; VI-NEXT: s_lshr_b32 s72, s19, 8 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v23 +; VI-NEXT: s_lshr_b32 s73, s23, 24 +; VI-NEXT: s_lshr_b32 s74, s23, 16 +; VI-NEXT: s_lshr_b32 s75, s23, 8 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v19 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v22 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v22 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v21 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v23 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v23 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v25 -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v27 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v27 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v29 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v29 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v53, 24, v24 -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v24 -; VI-NEXT: v_lshrrev_b32_e32 v58, 24, v26 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v26 -; VI-NEXT: v_lshrrev_b32_e32 v48, 24, v28 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v28 -; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v30 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v30 -; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v30 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v32 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v32 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v31 -; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v31 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v19 +; VI-NEXT: s_lshr_b32 s76, s21, 24 +; VI-NEXT: s_lshr_b32 s77, s21, 16 +; VI-NEXT: s_lshr_b32 s78, s21, 8 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v25 +; VI-NEXT: s_lshr_b32 s79, s25, 24 +; VI-NEXT: s_lshr_b32 s88, s25, 16 +; VI-NEXT: s_lshr_b32 s89, s25, 8 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v27 +; VI-NEXT: s_lshr_b32 s90, s41, 24 +; VI-NEXT: s_lshr_b32 s91, s41, 16 +; VI-NEXT: s_lshr_b32 s30, s41, 8 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v31 +; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v31 +; VI-NEXT: s_lshr_b32 s31, s43, 24 +; VI-NEXT: s_lshr_b32 s34, s43, 16 +; VI-NEXT: s_lshr_b32 s35, s43, 8 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v33 +; VI-NEXT: s_lshr_b32 s36, s45, 24 +; VI-NEXT: s_lshr_b32 s37, s45, 16 +; VI-NEXT: s_lshr_b32 s38, s45, 8 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v37 +; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v37 +; VI-NEXT: s_lshr_b32 s39, s47, 24 +; VI-NEXT: s_lshr_b32 s48, s47, 16 +; VI-NEXT: s_lshr_b32 s49, s47, 8 +; VI-NEXT: v_lshrrev_b32_e32 v61, 16, v50 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v50 +; VI-NEXT: s_lshr_b32 s51, s57, 24 +; VI-NEXT: s_lshr_b32 s52, s57, 16 +; VI-NEXT: s_lshr_b32 s53, s57, 8 +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v52 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v52 ; VI-NEXT: s_branch .LBB73_5 ; VI-NEXT: .LBB73_3: -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr50 +; VI-NEXT: ; implicit-def: $sgpr54 +; VI-NEXT: ; implicit-def: $sgpr53 +; VI-NEXT: ; implicit-def: $sgpr52 +; VI-NEXT: ; implicit-def: $sgpr51 +; VI-NEXT: ; implicit-def: $sgpr80 +; VI-NEXT: ; implicit-def: $sgpr81 +; VI-NEXT: ; implicit-def: $sgpr86 +; VI-NEXT: ; implicit-def: $sgpr87 ; VI-NEXT: ; implicit-def: $sgpr70 ; VI-NEXT: ; implicit-def: $sgpr71 ; VI-NEXT: ; implicit-def: $sgpr68 ; VI-NEXT: ; implicit-def: $sgpr69 ; VI-NEXT: ; implicit-def: $sgpr66 ; VI-NEXT: ; implicit-def: $sgpr67 +; VI-NEXT: ; implicit-def: $sgpr84 +; VI-NEXT: ; implicit-def: $sgpr85 ; VI-NEXT: ; implicit-def: $sgpr64 ; VI-NEXT: ; implicit-def: $sgpr65 -; VI-NEXT: ; implicit-def: $sgpr54 ; VI-NEXT: ; implicit-def: $sgpr55 -; VI-NEXT: ; implicit-def: $sgpr53 -; VI-NEXT: ; implicit-def: $sgpr87 -; VI-NEXT: ; implicit-def: $sgpr86 -; VI-NEXT: ; implicit-def: $sgpr52 -; VI-NEXT: ; implicit-def: $sgpr50 -; VI-NEXT: ; implicit-def: $sgpr51 -; VI-NEXT: ; implicit-def: $sgpr84 -; VI-NEXT: ; implicit-def: $sgpr85 -; VI-NEXT: ; implicit-def: $sgpr82 ; VI-NEXT: ; implicit-def: $sgpr83 -; VI-NEXT: ; implicit-def: $sgpr80 -; VI-NEXT: ; implicit-def: $sgpr81 +; VI-NEXT: ; implicit-def: $sgpr82 ; VI-NEXT: ; implicit-def: $sgpr48 ; VI-NEXT: ; implicit-def: $sgpr38 ; VI-NEXT: ; implicit-def: $sgpr36 @@ -119031,404 +119087,400 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: ; implicit-def: $sgpr62 ; VI-NEXT: ; implicit-def: $sgpr60 ; VI-NEXT: ; implicit-def: $sgpr58 -; VI-NEXT: ; implicit-def: $sgpr56 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: s_branch .LBB73_2 ; VI-NEXT: .LBB73_4: -; VI-NEXT: v_mov_b32_e32 v33, s71 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s69 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s68 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s67 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s66 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s65 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s64 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s55 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s54 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s87 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s53 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s52 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s86 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s51 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s50 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s85 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s84 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s83 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s82 -; VI-NEXT: v_mov_b32_e32 v11, s4 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s81 -; VI-NEXT: v_readlane_b32 s4, v62, 0 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s80 -; VI-NEXT: v_mov_b32_e32 v34, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 1 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 2 -; VI-NEXT: v_mov_b32_e32 v38, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 3 -; VI-NEXT: v_mov_b32_e32 v52, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 4 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 5 -; VI-NEXT: v_mov_b32_e32 v37, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 6 -; VI-NEXT: v_mov_b32_e32 v43, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 7 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 8 -; VI-NEXT: v_mov_b32_e32 v35, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 9 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v35, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 10 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v35, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 11 -; VI-NEXT: v_mov_b32_e32 v41, s4 -; VI-NEXT: v_mov_b32_e32 v40, s48 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s38 -; VI-NEXT: v_readlane_b32 s4, v62, 12 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v35, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 13 -; VI-NEXT: v_mov_b32_e32 v44, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 14 -; VI-NEXT: v_mov_b32_e32 v51, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 15 -; VI-NEXT: v_mov_b32_e32 v45, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 16 -; VI-NEXT: v_mov_b32_e32 v47, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 17 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v35, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 18 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v35, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 19 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v35, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 20 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v35, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 21 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v35, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 22 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v35, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 23 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v35, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 24 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v35, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 25 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s36 -; VI-NEXT: v_mov_b32_e32 v35, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 26 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v35, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 27 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v35, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 28 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v35, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 29 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v35, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 30 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v35, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 31 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v35, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 32 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v35, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 33 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v35, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 34 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v35, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 35 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v35, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 36 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v35, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 37 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v35, s4 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s34 -; VI-NEXT: v_readlane_b32 s4, v62, 38 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v35, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 39 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v35, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 40 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v35, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 41 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v35, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 42 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v35, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 43 -; VI-NEXT: v_mov_b32_e32 v53, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 44 -; VI-NEXT: v_mov_b32_e32 v56, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 45 -; VI-NEXT: v_mov_b32_e32 v57, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 46 -; VI-NEXT: v_mov_b32_e32 v58, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 47 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v35, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 48 -; VI-NEXT: v_mov_b32_e32 v54, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 49 -; VI-NEXT: v_mov_b32_e32 v48, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 50 -; VI-NEXT: v_mov_b32_e32 v59, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 51 -; VI-NEXT: v_mov_b32_e32 v60, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 52 -; VI-NEXT: v_mov_b32_e32 v39, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 53 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s30 -; VI-NEXT: v_mov_b32_e32 v49, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 54 -; VI-NEXT: v_mov_b32_e32 v61, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 55 -; VI-NEXT: v_mov_b32_e32 v36, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 56 -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 57 -; VI-NEXT: v_mov_b32_e32 v12, s5 -; VI-NEXT: v_mov_b32_e32 v1, s44 -; VI-NEXT: v_mov_b32_e32 v2, s45 -; VI-NEXT: v_mov_b32_e32 v3, s42 -; VI-NEXT: v_mov_b32_e32 v4, s43 -; VI-NEXT: v_mov_b32_e32 v5, s40 -; VI-NEXT: v_mov_b32_e32 v6, s41 -; VI-NEXT: v_mov_b32_e32 v7, s14 -; VI-NEXT: v_mov_b32_e32 v8, s15 -; VI-NEXT: v_mov_b32_e32 v9, s12 -; VI-NEXT: v_mov_b32_e32 v10, s13 -; VI-NEXT: v_mov_b32_e32 v13, s10 -; VI-NEXT: v_mov_b32_e32 v14, s11 -; VI-NEXT: v_mov_b32_e32 v15, s8 -; VI-NEXT: v_mov_b32_e32 v16, s9 -; VI-NEXT: v_mov_b32_e32 v17, s6 -; VI-NEXT: v_mov_b32_e32 v18, s7 -; VI-NEXT: v_mov_b32_e32 v19, s28 -; VI-NEXT: v_mov_b32_e32 v20, s29 -; VI-NEXT: v_mov_b32_e32 v21, s26 -; VI-NEXT: v_mov_b32_e32 v22, s27 -; VI-NEXT: v_mov_b32_e32 v23, s24 -; VI-NEXT: v_mov_b32_e32 v24, s25 -; VI-NEXT: v_mov_b32_e32 v25, s22 -; VI-NEXT: v_mov_b32_e32 v26, s23 -; VI-NEXT: v_mov_b32_e32 v27, s20 -; VI-NEXT: v_mov_b32_e32 v28, s21 -; VI-NEXT: v_mov_b32_e32 v29, s18 -; VI-NEXT: v_mov_b32_e32 v30, s19 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s90 -; VI-NEXT: v_mov_b32_e32 v31, s16 -; VI-NEXT: v_mov_b32_e32 v32, s17 -; VI-NEXT: v_mov_b32_e32 v42, s70 -; VI-NEXT: v_mov_b32_e32 v50, s4 -; VI-NEXT: v_mov_b32_e32 v46, v38 -; VI-NEXT: v_mov_b32_e32 v38, v34 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s88 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s78 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v40, s76 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s74 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s72 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s62 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s60 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s58 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s56 +; VI-NEXT: v_mov_b32_e32 v39, s58 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s46 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, v43 +; VI-NEXT: v_mov_b32_e32 v39, s28 +; VI-NEXT: v_readlane_b32 s27, v62, 0 +; VI-NEXT: v_mov_b32_e32 v26, s27 +; VI-NEXT: v_readlane_b32 s27, v62, 1 +; VI-NEXT: v_mov_b32_e32 v22, s27 +; VI-NEXT: v_readlane_b32 s27, v62, 2 +; VI-NEXT: v_mov_b32_e32 v24, s27 +; VI-NEXT: v_readlane_b32 s27, v62, 3 +; VI-NEXT: v_mov_b32_e32 v18, s27 +; VI-NEXT: v_readlane_b32 s27, v62, 4 +; VI-NEXT: v_mov_b32_e32 v20, s27 +; VI-NEXT: v_readlane_b32 s27, v62, 5 +; VI-NEXT: v_mov_b32_e32 v14, s27 +; VI-NEXT: v_readlane_b32 s27, v62, 6 +; VI-NEXT: v_mov_b32_e32 v16, s27 +; VI-NEXT: v_readlane_b32 s27, v62, 7 +; VI-NEXT: v_mov_b32_e32 v10, s27 +; VI-NEXT: v_readlane_b32 s27, v62, 8 +; VI-NEXT: v_mov_b32_e32 v12, s27 +; VI-NEXT: v_readlane_b32 s27, v62, 9 +; VI-NEXT: v_mov_b32_e32 v6, s27 +; VI-NEXT: v_readlane_b32 s27, v62, 10 +; VI-NEXT: v_mov_b32_e32 v30, s83 +; VI-NEXT: v_mov_b32_e32 v8, s27 +; VI-NEXT: v_readlane_b32 s27, v62, 11 +; VI-NEXT: v_mov_b32_e32 v29, s78 +; VI-NEXT: v_mov_b32_e32 v2, s27 +; VI-NEXT: v_readlane_b32 s27, v62, 12 +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v15, s54 +; VI-NEXT: v_mov_b32_e32 v21, s50 +; VI-NEXT: v_mov_b32_e32 v61, s81 +; VI-NEXT: v_mov_b32_e32 v9, s80 +; VI-NEXT: v_mov_b32_e32 v59, s87 +; VI-NEXT: v_mov_b32_e32 v60, s86 +; VI-NEXT: v_mov_b32_e32 v57, s71 +; VI-NEXT: v_mov_b32_e32 v58, s70 +; VI-NEXT: v_mov_b32_e32 v47, s69 +; VI-NEXT: v_mov_b32_e32 v56, s68 +; VI-NEXT: v_mov_b32_e32 v51, s67 +; VI-NEXT: v_mov_b32_e32 v53, s66 +; VI-NEXT: v_mov_b32_e32 v38, s85 +; VI-NEXT: v_mov_b32_e32 v49, s84 +; VI-NEXT: v_mov_b32_e32 v34, s65 +; VI-NEXT: v_mov_b32_e32 v36, s64 +; VI-NEXT: v_mov_b32_e32 v32, s55 +; VI-NEXT: v_mov_b32_e32 v28, s82 +; VI-NEXT: v_mov_b32_e32 v4, s27 +; VI-NEXT: v_mov_b32_e32 v52, s56 +; VI-NEXT: v_mov_b32_e32 v50, s46 +; VI-NEXT: v_mov_b32_e32 v37, s44 +; VI-NEXT: v_mov_b32_e32 v33, s42 +; VI-NEXT: v_mov_b32_e32 v31, s40 +; VI-NEXT: v_mov_b32_e32 v27, s24 +; VI-NEXT: v_mov_b32_e32 v25, s20 +; VI-NEXT: v_mov_b32_e32 v19, s22 +; VI-NEXT: v_mov_b32_e32 v23, s18 +; VI-NEXT: v_mov_b32_e32 v17, s16 +; VI-NEXT: v_mov_b32_e32 v13, s14 +; VI-NEXT: v_mov_b32_e32 v11, s12 +; VI-NEXT: v_mov_b32_e32 v7, s10 +; VI-NEXT: v_mov_b32_e32 v5, s8 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v55, s48 +; VI-NEXT: v_mov_b32_e32 v46, s38 +; VI-NEXT: v_mov_b32_e32 v45, s36 +; VI-NEXT: v_mov_b32_e32 v44, s34 +; VI-NEXT: v_mov_b32_e32 v43, s30 +; VI-NEXT: v_mov_b32_e32 v42, s90 +; VI-NEXT: v_mov_b32_e32 v41, s88 +; VI-NEXT: v_mov_b32_e32 v54, s74 +; VI-NEXT: v_mov_b32_e32 v48, s72 +; VI-NEXT: v_mov_b32_e32 v35, s62 +; VI-NEXT: v_mov_b32_e32 v29, s60 +; VI-NEXT: v_readlane_b32 s10, v62, 13 +; VI-NEXT: v_readlane_b32 s12, v62, 14 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s26 +; VI-NEXT: v_readlane_b32 s14, v62, 15 +; VI-NEXT: v_readlane_b32 s16, v62, 16 +; VI-NEXT: v_readlane_b32 s18, v62, 17 +; VI-NEXT: v_readlane_b32 s20, v62, 18 +; VI-NEXT: v_readlane_b32 s22, v62, 19 +; VI-NEXT: v_readlane_b32 s24, v62, 20 +; VI-NEXT: v_readlane_b32 s26, v62, 21 +; VI-NEXT: v_readlane_b32 s27, v62, 22 +; VI-NEXT: v_readlane_b32 s28, v62, 23 +; VI-NEXT: v_readlane_b32 s29, v62, 24 +; VI-NEXT: v_readlane_b32 s40, v62, 25 +; VI-NEXT: v_readlane_b32 s42, v62, 26 +; VI-NEXT: v_readlane_b32 s44, v62, 27 +; VI-NEXT: v_readlane_b32 s46, v62, 28 +; VI-NEXT: v_readlane_b32 s56, v62, 29 +; VI-NEXT: v_readlane_b32 s58, v62, 30 +; VI-NEXT: v_readlane_b32 s59, v62, 31 +; VI-NEXT: v_readlane_b32 s60, v62, 32 +; VI-NEXT: v_readlane_b32 s61, v62, 33 +; VI-NEXT: v_readlane_b32 s62, v62, 34 +; VI-NEXT: v_readlane_b32 s63, v62, 35 +; VI-NEXT: v_readlane_b32 s72, v62, 36 +; VI-NEXT: v_readlane_b32 s73, v62, 37 +; VI-NEXT: v_readlane_b32 s74, v62, 38 +; VI-NEXT: v_readlane_b32 s75, v62, 39 +; VI-NEXT: v_readlane_b32 s76, v62, 40 +; VI-NEXT: v_readlane_b32 s77, v62, 41 +; VI-NEXT: v_readlane_b32 s78, v62, 42 +; VI-NEXT: v_readlane_b32 s79, v62, 43 +; VI-NEXT: v_readlane_b32 s88, v62, 44 +; VI-NEXT: v_readlane_b32 s89, v62, 45 +; VI-NEXT: v_readlane_b32 s90, v62, 46 +; VI-NEXT: v_readlane_b32 s91, v62, 47 +; VI-NEXT: v_readlane_b32 s30, v62, 48 +; VI-NEXT: v_readlane_b32 s31, v62, 49 +; VI-NEXT: v_readlane_b32 s34, v62, 50 +; VI-NEXT: v_readlane_b32 s35, v62, 51 +; VI-NEXT: v_readlane_b32 s36, v62, 52 +; VI-NEXT: v_readlane_b32 s37, v62, 53 +; VI-NEXT: v_readlane_b32 s38, v62, 54 +; VI-NEXT: v_readlane_b32 s39, v62, 55 +; VI-NEXT: v_readlane_b32 s48, v62, 56 +; VI-NEXT: v_readlane_b32 s49, v62, 57 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; VI-NEXT: .LBB73_5: ; %end -; VI-NEXT: v_lshlrev_b32_e32 v34, 8, v42 -; VI-NEXT: v_or_b32_sdwa v31, v31, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v34, 8, v50 -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v32, v32, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s57, 0xff +; VI-NEXT: s_lshl_b32 s6, s53, 8 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_and_b32 s6, s52, 0xff +; VI-NEXT: s_lshl_b32 s8, s51, 8 +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: v_or_b32_sdwa v21, v52, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v52, 8, v55 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v15, v15, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: v_or_b32_sdwa v15, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v21, s4 +; VI-NEXT: s_and_b32 s4, s47, 0xff +; VI-NEXT: s_lshl_b32 s6, s49, 8 +; VI-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v15, vcc, 4, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_and_b32 s6, s48, 0xff +; VI-NEXT: s_lshl_b32 s8, s39, 8 +; VI-NEXT: buffer_store_dword v21, v15, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v46 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: v_or_b32_sdwa v9, v50, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v61, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v9, v9, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v15, vcc, 8, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: buffer_store_dword v9, v15, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v15, s4 +; VI-NEXT: s_and_b32 s4, s45, 0xff +; VI-NEXT: s_lshl_b32 s6, s38, 8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 12, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_and_b32 s6, s37, 0xff +; VI-NEXT: s_lshl_b32 s8, s36, 8 +; VI-NEXT: buffer_store_dword v15, v9, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v60 +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v45 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: v_or_b32_sdwa v9, v37, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v59, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v9, v9, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v15, vcc, 16, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: buffer_store_dword v9, v15, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v15, s4 +; VI-NEXT: s_and_b32 s4, s43, 0xff +; VI-NEXT: s_lshl_b32 s6, s35, 8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 20, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_and_b32 s6, s34, 0xff +; VI-NEXT: s_lshl_b32 s8, s31, 8 +; VI-NEXT: buffer_store_dword v15, v9, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v58 +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v44 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: v_or_b32_sdwa v9, v33, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v57, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v9, v9, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v15, vcc, 24, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: buffer_store_dword v9, v15, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v15, s4 +; VI-NEXT: s_and_b32 s4, s41, 0xff +; VI-NEXT: s_lshl_b32 s6, s30, 8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 28, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_and_b32 s6, s91, 0xff +; VI-NEXT: s_lshl_b32 s8, s90, 8 +; VI-NEXT: buffer_store_dword v15, v9, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v56 +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v43 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: v_or_b32_sdwa v9, v31, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v47, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v9, v9, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v15, vcc, 32, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: buffer_store_dword v9, v15, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v15, s4 +; VI-NEXT: s_and_b32 s4, s25, 0xff +; VI-NEXT: s_lshl_b32 s6, s89, 8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 36, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_and_b32 s6, s88, 0xff +; VI-NEXT: s_lshl_b32 s8, s79, 8 +; VI-NEXT: buffer_store_dword v15, v9, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v53 +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v42 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: v_or_b32_sdwa v9, v27, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v51, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v9, v9, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v15, vcc, 40, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: buffer_store_dword v9, v15, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v15, s4 +; VI-NEXT: s_and_b32 s4, s21, 0xff +; VI-NEXT: s_lshl_b32 s6, s78, 8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 44, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_and_b32 s6, s77, 0xff +; VI-NEXT: s_lshl_b32 s8, s76, 8 +; VI-NEXT: buffer_store_dword v15, v9, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v49 +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v41 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: v_or_b32_sdwa v9, v25, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v38, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v9, v9, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v15, vcc, 48, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: buffer_store_dword v9, v15, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v9, vcc, 52, v0 +; VI-NEXT: v_mov_b32_e32 v15, s4 +; VI-NEXT: buffer_store_dword v15, v9, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v36 +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: s_and_b32 s4, s23, 0xff +; VI-NEXT: s_lshl_b32 s6, s75, 8 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_and_b32 s6, s74, 0xff +; VI-NEXT: s_lshl_b32 s8, s73, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: v_or_b32_sdwa v9, v19, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_lshl_b32 s6, s72, 8 +; VI-NEXT: s_lshl_b32 s8, s62, 8 ; VI-NEXT: v_readlane_b32 s87, v63, 31 ; VI-NEXT: v_readlane_b32 s86, v63, 30 ; VI-NEXT: v_readlane_b32 s85, v63, 29 @@ -119461,367 +119513,173 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_readlane_b32 s34, v63, 2 ; VI-NEXT: v_readlane_b32 s31, v63, 1 ; VI-NEXT: v_readlane_b32 s30, v63, 0 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v34, 8, v42 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v34, v50, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v31, v31, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v36 -; VI-NEXT: v_or_b32_sdwa v31, v55, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v32, vcc, 4, v0 -; VI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v31 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v32 -; VI-NEXT: v_or_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v29, v29, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v31, vcc, 8, v0 -; VI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v61 -; VI-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v39 -; VI-NEXT: v_or_b32_sdwa v30, v49, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v30, vcc, 12, v0 -; VI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v29 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v30 -; VI-NEXT: v_or_b32_sdwa v27, v27, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v27, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v29, vcc, 16, v0 -; VI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v60 -; VI-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v48 -; VI-NEXT: v_or_b32_sdwa v28, v59, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v28, vcc, 20, v0 -; VI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v28 -; VI-NEXT: v_or_b32_sdwa v25, v25, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v25, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v27, vcc, 24, v0 -; VI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v54 -; VI-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v58 -; VI-NEXT: v_or_b32_sdwa v26, v35, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v26, vcc, 28, v0 -; VI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v26 -; VI-NEXT: v_or_b32_sdwa v23, v23, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v23, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v25, vcc, 32, v0 -; VI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v57 -; VI-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v53 -; VI-NEXT: v_or_b32_sdwa v24, v56, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v24, vcc, 36, v0 -; VI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v24 -; VI-NEXT: v_or_b32_sdwa v21, v21, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v21, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v23, vcc, 40, v0 -; VI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 -; VI-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 -; VI-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v22, vcc, 44, v0 -; VI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 -; VI-NEXT: v_or_b32_sdwa v19, v19, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v21, vcc, 48, v0 -; VI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 -; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 -; VI-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v20, vcc, 52, v0 -; VI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 -; VI-NEXT: v_or_b32_sdwa v17, v17, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v19, vcc, 56, v0 -; VI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 60, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v15, v15, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v17, vcc, 64, v0 -; VI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x44, v0 -; VI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; VI-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v36 +; VI-NEXT: v_or_b32_sdwa v15, v34, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v15, vcc, 56, v0 +; VI-NEXT: buffer_store_dword v9, v15, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v15, s4 +; VI-NEXT: s_and_b32 s4, s19, 0xff +; VI-NEXT: v_add_u32_e32 v9, vcc, 60, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_and_b32 s6, s63, 0xff +; VI-NEXT: buffer_store_dword v15, v9, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v32 +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v40 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: v_or_b32_sdwa v9, v23, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v30, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v9, v9, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v15, vcc, 64, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: buffer_store_dword v9, v15, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v15, s4 +; VI-NEXT: s_and_b32 s4, s17, 0xff +; VI-NEXT: s_lshl_b32 s6, s61, 8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x44, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_and_b32 s6, s60, 0xff +; VI-NEXT: s_lshl_b32 s8, s59, 8 +; VI-NEXT: buffer_store_dword v15, v9, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v28 +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v54 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: v_or_b32_sdwa v9, v17, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v26, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v9, v9, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x48, v0 -; VI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x4c, v0 -; VI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; VI-NEXT: v_or_b32_sdwa v9, v9, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: buffer_store_dword v9, v15, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x4c, v0 +; VI-NEXT: v_mov_b32_e32 v15, s4 +; VI-NEXT: s_and_b32 s4, s15, 0xff +; VI-NEXT: s_lshl_b32 s6, s58, 8 +; VI-NEXT: buffer_store_dword v15, v9, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v24 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_and_b32 s6, s56, 0xff +; VI-NEXT: s_lshl_b32 s8, s46, 8 +; VI-NEXT: v_or_b32_sdwa v9, v13, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v48 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: v_or_b32_sdwa v13, v22, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: v_or_b32_sdwa v9, v9, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x50, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: buffer_store_dword v9, v13, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 -; VI-NEXT: v_or_b32_sdwa v10, v13, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x54, v0 -; VI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x54, v0 +; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: s_and_b32 s4, s13, 0xff +; VI-NEXT: s_lshl_b32 s6, s44, 8 +; VI-NEXT: buffer_store_dword v13, v9, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v20 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_and_b32 s6, s42, 0xff +; VI-NEXT: s_lshl_b32 s8, s40, 8 +; VI-NEXT: v_or_b32_sdwa v9, v11, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v35 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: v_or_b32_sdwa v11, v18, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x58, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x5c, v0 +; VI-NEXT: v_mov_b32_e32 v11, s4 +; VI-NEXT: s_and_b32 s4, s11, 0xff +; VI-NEXT: s_lshl_b32 s6, s29, 8 +; VI-NEXT: buffer_store_dword v11, v9, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v16 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_and_b32 s6, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; VI-NEXT: v_or_b32_sdwa v9, v38, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v29 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: v_or_b32_sdwa v9, v14, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x58, v0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x60, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; VI-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x5c, v0 -; VI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v52 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x64, v0 +; VI-NEXT: v_mov_b32_e32 v9, s4 +; VI-NEXT: buffer_store_dword v9, v7, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v12 +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: s_and_b32 s4, s9, 0xff +; VI-NEXT: s_lshl_b32 s6, s26, 8 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s8, s22, 8 ; VI-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_lshl_b32 s6, s16, 8 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; VI-NEXT: v_or_b32_sdwa v7, v46, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v11 +; VI-NEXT: v_or_b32_sdwa v7, v10, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x60, v0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x68, v0 ; VI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; VI-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; VI-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x64, v0 -; VI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v37 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x6c, v0 +; VI-NEXT: v_mov_b32_e32 v7, s4 +; VI-NEXT: buffer_store_dword v7, v5, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v8 +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: s_and_b32 s4, s5, 0xff +; VI-NEXT: s_lshl_b32 s5, s20, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff ; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_lshl_b32 s5, s14, 8 +; VI-NEXT: s_lshl_b32 s6, s10, 8 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 ; VI-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x68, v0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x70, v0 ; VI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v47 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x6c, v0 -; VI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v33 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x74, v0 +; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4 ; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: s_and_b32 s4, s7, 0xff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s12, 0xff +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; VI-NEXT: v_or_b32_sdwa v3, v40, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x70, v0 -; VI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v45 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v44 -; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -119838,8 +119696,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -119848,8 +119706,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v63, s30, 0 ; GFX9-NEXT: v_writelane_b32 v63, s31, 1 @@ -119873,40 +119731,68 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_writelane_b32 v63, s67, 19 ; GFX9-NEXT: v_writelane_b32 v63, s68, 20 ; GFX9-NEXT: v_writelane_b32 v63, s69, 21 +; GFX9-NEXT: v_mov_b32_e32 v20, s16 ; GFX9-NEXT: v_writelane_b32 v63, s70, 22 +; GFX9-NEXT: v_readfirstlane_b32 s56, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s17 ; GFX9-NEXT: v_writelane_b32 v63, s71, 23 +; GFX9-NEXT: v_readfirstlane_b32 s57, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s18 ; GFX9-NEXT: v_writelane_b32 v63, s80, 24 +; GFX9-NEXT: v_readfirstlane_b32 s46, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s19 ; GFX9-NEXT: v_writelane_b32 v63, s81, 25 +; GFX9-NEXT: v_readfirstlane_b32 s47, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s20 ; GFX9-NEXT: v_writelane_b32 v63, s82, 26 +; GFX9-NEXT: v_readfirstlane_b32 s44, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s21 ; GFX9-NEXT: v_writelane_b32 v63, s83, 27 +; GFX9-NEXT: v_readfirstlane_b32 s45, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s22 ; GFX9-NEXT: v_writelane_b32 v63, s84, 28 +; GFX9-NEXT: v_readfirstlane_b32 s42, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s23 ; GFX9-NEXT: v_writelane_b32 v63, s85, 29 +; GFX9-NEXT: v_readfirstlane_b32 s43, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s24 ; GFX9-NEXT: v_writelane_b32 v63, s86, 30 +; GFX9-NEXT: v_readfirstlane_b32 s40, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s25 ; GFX9-NEXT: v_writelane_b32 v63, s87, 31 +; GFX9-NEXT: v_readfirstlane_b32 s41, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s26 ; GFX9-NEXT: v_writelane_b32 v63, s96, 32 +; GFX9-NEXT: v_readfirstlane_b32 s24, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s27 ; GFX9-NEXT: v_writelane_b32 v63, s97, 33 +; GFX9-NEXT: v_readfirstlane_b32 s25, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s28 ; GFX9-NEXT: v_writelane_b32 v63, s98, 34 +; GFX9-NEXT: v_readfirstlane_b32 s22, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 ; GFX9-NEXT: v_writelane_b32 v63, s99, 35 -; GFX9-NEXT: v_readfirstlane_b32 s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s7, v2 -; GFX9-NEXT: v_readfirstlane_b32 s8, v3 -; GFX9-NEXT: v_readfirstlane_b32 s9, v4 -; GFX9-NEXT: v_readfirstlane_b32 s10, v5 -; GFX9-NEXT: v_readfirstlane_b32 s11, v6 -; GFX9-NEXT: v_readfirstlane_b32 s12, v7 -; GFX9-NEXT: v_readfirstlane_b32 s13, v8 -; GFX9-NEXT: v_readfirstlane_b32 s14, v9 -; GFX9-NEXT: v_readfirstlane_b32 s15, v10 -; GFX9-NEXT: v_readfirstlane_b32 s40, v11 -; GFX9-NEXT: v_readfirstlane_b32 s41, v12 -; GFX9-NEXT: v_readfirstlane_b32 s42, v13 -; GFX9-NEXT: v_readfirstlane_b32 s43, v14 -; GFX9-NEXT: v_readfirstlane_b32 s44, v15 -; GFX9-NEXT: v_readfirstlane_b32 s45, v16 -; GFX9-NEXT: v_readfirstlane_b32 s4, v17 -; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec -; GFX9-NEXT: v_readfirstlane_b32 s5, v18 +; GFX9-NEXT: v_readfirstlane_b32 s23, v20 +; GFX9-NEXT: v_readfirstlane_b32 s20, v1 +; GFX9-NEXT: v_readfirstlane_b32 s21, v2 +; GFX9-NEXT: v_readfirstlane_b32 s18, v3 +; GFX9-NEXT: v_readfirstlane_b32 s19, v4 +; GFX9-NEXT: v_readfirstlane_b32 s16, v5 +; GFX9-NEXT: v_readfirstlane_b32 s17, v6 +; GFX9-NEXT: v_readfirstlane_b32 s14, v7 +; GFX9-NEXT: v_readfirstlane_b32 s15, v8 +; GFX9-NEXT: v_readfirstlane_b32 s12, v9 +; GFX9-NEXT: v_readfirstlane_b32 s13, v10 +; GFX9-NEXT: v_readfirstlane_b32 s10, v11 +; GFX9-NEXT: v_readfirstlane_b32 s11, v12 +; GFX9-NEXT: v_readfirstlane_b32 s8, v13 +; GFX9-NEXT: v_readfirstlane_b32 s9, v14 +; GFX9-NEXT: v_readfirstlane_b32 s4, v15 +; GFX9-NEXT: v_readfirstlane_b32 s5, v16 +; GFX9-NEXT: v_readfirstlane_b32 s6, v17 +; GFX9-NEXT: s_and_b64 s[26:27], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s7, v18 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill @@ -119924,372 +119810,300 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane ; GFX9-NEXT: s_cbranch_scc0 .LBB73_3 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_lshr_b32 s46, s5, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 2 -; GFX9-NEXT: s_lshr_b32 s46, s5, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 3 -; GFX9-NEXT: s_lshr_b32 s46, s5, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 4 -; GFX9-NEXT: s_lshr_b32 s46, s4, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 0 -; GFX9-NEXT: s_lshr_b32 s46, s4, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 1 -; GFX9-NEXT: s_lshr_b32 s46, s45, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 5 -; GFX9-NEXT: s_lshr_b32 s46, s45, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 6 -; GFX9-NEXT: s_lshr_b32 s46, s45, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 7 -; GFX9-NEXT: s_lshr_b32 s46, s43, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 8 -; GFX9-NEXT: s_lshr_b32 s46, s43, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 9 -; GFX9-NEXT: s_lshr_b32 s46, s43, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 10 -; GFX9-NEXT: s_lshr_b32 s46, s41, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 11 -; GFX9-NEXT: s_lshr_b32 s46, s41, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 12 -; GFX9-NEXT: s_lshr_b32 s46, s41, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 13 -; GFX9-NEXT: s_lshr_b32 s46, s15, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 14 -; GFX9-NEXT: s_lshr_b32 s46, s15, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 15 -; GFX9-NEXT: s_lshr_b32 s46, s15, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 16 -; GFX9-NEXT: s_lshr_b32 s46, s13, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 17 -; GFX9-NEXT: s_lshr_b32 s46, s13, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 18 -; GFX9-NEXT: s_lshr_b32 s46, s13, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 19 -; GFX9-NEXT: s_lshr_b32 s46, s11, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 20 -; GFX9-NEXT: s_lshr_b32 s46, s11, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 21 -; GFX9-NEXT: s_lshr_b32 s46, s11, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 22 -; GFX9-NEXT: s_lshr_b32 s46, s9, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 23 -; GFX9-NEXT: s_lshr_b32 s46, s9, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 24 -; GFX9-NEXT: s_lshr_b32 s46, s9, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 25 -; GFX9-NEXT: s_lshr_b32 s46, s7, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 26 -; GFX9-NEXT: s_lshr_b32 s46, s7, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 27 -; GFX9-NEXT: s_lshr_b32 s46, s7, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 28 -; GFX9-NEXT: s_lshr_b32 s46, s29, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 29 -; GFX9-NEXT: s_lshr_b32 s46, s29, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 30 -; GFX9-NEXT: s_lshr_b32 s46, s29, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 31 -; GFX9-NEXT: s_lshr_b32 s46, s27, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 32 -; GFX9-NEXT: s_lshr_b32 s46, s27, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 33 -; GFX9-NEXT: s_lshr_b32 s46, s27, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 34 -; GFX9-NEXT: s_lshr_b32 s46, s25, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 35 -; GFX9-NEXT: s_lshr_b32 s46, s25, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 36 -; GFX9-NEXT: s_lshr_b32 s46, s25, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 37 -; GFX9-NEXT: s_lshr_b32 s46, s23, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 38 -; GFX9-NEXT: s_lshr_b32 s46, s23, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 39 -; GFX9-NEXT: s_lshr_b32 s46, s23, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 40 -; GFX9-NEXT: s_lshr_b32 s46, s21, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 41 -; GFX9-NEXT: s_lshr_b32 s46, s21, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 42 -; GFX9-NEXT: s_lshr_b32 s46, s21, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 43 -; GFX9-NEXT: s_lshr_b32 s46, s19, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 44 -; GFX9-NEXT: s_lshr_b32 s46, s19, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 45 -; GFX9-NEXT: s_lshr_b32 s46, s19, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 46 -; GFX9-NEXT: s_lshr_b32 s46, s17, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 47 -; GFX9-NEXT: s_lshr_b32 s46, s17, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 48 -; GFX9-NEXT: s_lshr_b32 s46, s17, 8 -; GFX9-NEXT: s_lshr_b32 s83, s44, 16 -; GFX9-NEXT: s_lshr_b32 s82, s44, 8 -; GFX9-NEXT: s_lshr_b32 s85, s42, 16 -; GFX9-NEXT: s_lshr_b32 s84, s42, 8 -; GFX9-NEXT: s_lshr_b32 s87, s40, 16 -; GFX9-NEXT: s_lshr_b32 s86, s40, 8 -; GFX9-NEXT: s_lshr_b32 s97, s14, 16 -; GFX9-NEXT: s_lshr_b32 s96, s14, 8 -; GFX9-NEXT: s_lshr_b32 s99, s12, 16 -; GFX9-NEXT: s_lshr_b32 s98, s12, 8 -; GFX9-NEXT: s_lshr_b32 s39, s10, 16 -; GFX9-NEXT: s_lshr_b32 s38, s10, 8 -; GFX9-NEXT: s_lshr_b32 s49, s8, 16 -; GFX9-NEXT: s_lshr_b32 s48, s8, 8 -; GFX9-NEXT: s_lshr_b32 s51, s6, 16 -; GFX9-NEXT: s_lshr_b32 s50, s6, 8 -; GFX9-NEXT: s_lshr_b32 s53, s28, 16 -; GFX9-NEXT: s_lshr_b32 s52, s28, 8 -; GFX9-NEXT: s_lshr_b32 s55, s26, 16 -; GFX9-NEXT: s_lshr_b32 s54, s26, 8 -; GFX9-NEXT: s_lshr_b32 s65, s24, 16 -; GFX9-NEXT: s_lshr_b32 s64, s24, 8 -; GFX9-NEXT: s_lshr_b32 s67, s22, 16 -; GFX9-NEXT: s_lshr_b32 s66, s22, 8 -; GFX9-NEXT: s_lshr_b32 s69, s20, 16 -; GFX9-NEXT: s_lshr_b32 s68, s20, 8 -; GFX9-NEXT: s_lshr_b32 s71, s18, 16 -; GFX9-NEXT: s_lshr_b32 s70, s18, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 49 -; GFX9-NEXT: s_lshr_b32 s81, s16, 16 -; GFX9-NEXT: s_lshr_b32 s80, s16, 8 -; GFX9-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 -; GFX9-NEXT: s_lshr_b64 s[56:57], s[44:45], 24 -; GFX9-NEXT: s_lshr_b64 s[58:59], s[42:43], 24 -; GFX9-NEXT: s_lshr_b64 s[60:61], s[40:41], 24 -; GFX9-NEXT: s_lshr_b64 s[62:63], s[14:15], 24 -; GFX9-NEXT: s_lshr_b64 s[72:73], s[12:13], 24 -; GFX9-NEXT: s_lshr_b64 s[74:75], s[10:11], 24 -; GFX9-NEXT: s_lshr_b64 s[76:77], s[8:9], 24 -; GFX9-NEXT: s_lshr_b64 s[78:79], s[6:7], 24 -; GFX9-NEXT: s_lshr_b64 s[88:89], s[28:29], 24 -; GFX9-NEXT: s_lshr_b64 s[90:91], s[26:27], 24 -; GFX9-NEXT: s_lshr_b64 s[92:93], s[24:25], 24 -; GFX9-NEXT: s_lshr_b64 s[94:95], s[22:23], 24 -; GFX9-NEXT: s_lshr_b64 s[30:31], s[20:21], 24 -; GFX9-NEXT: s_lshr_b64 s[34:35], s[18:19], 24 -; GFX9-NEXT: s_lshr_b64 s[36:37], s[16:17], 24 +; GFX9-NEXT: s_lshr_b32 s26, s7, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 2 +; GFX9-NEXT: s_lshr_b32 s26, s7, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 3 +; GFX9-NEXT: s_lshr_b32 s26, s7, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 4 +; GFX9-NEXT: s_lshr_b32 s26, s6, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 0 +; GFX9-NEXT: s_lshr_b32 s26, s6, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 1 +; GFX9-NEXT: s_lshr_b32 s26, s5, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 5 +; GFX9-NEXT: s_lshr_b32 s26, s5, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 6 +; GFX9-NEXT: s_lshr_b32 s26, s5, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 7 +; GFX9-NEXT: s_lshr_b32 s26, s9, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 8 +; GFX9-NEXT: s_lshr_b32 s26, s9, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 9 +; GFX9-NEXT: s_lshr_b32 s26, s9, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 10 +; GFX9-NEXT: s_lshr_b32 s26, s11, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 11 +; GFX9-NEXT: s_lshr_b32 s26, s11, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 12 +; GFX9-NEXT: s_lshr_b32 s26, s11, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 13 +; GFX9-NEXT: s_lshr_b32 s26, s13, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 14 +; GFX9-NEXT: s_lshr_b32 s26, s13, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 15 +; GFX9-NEXT: s_lshr_b32 s26, s13, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 16 +; GFX9-NEXT: s_lshr_b32 s26, s15, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 17 +; GFX9-NEXT: s_lshr_b32 s26, s15, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 18 +; GFX9-NEXT: s_lshr_b32 s26, s15, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 19 +; GFX9-NEXT: s_lshr_b32 s26, s17, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 20 +; GFX9-NEXT: s_lshr_b32 s26, s17, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 21 +; GFX9-NEXT: s_lshr_b32 s26, s17, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 22 +; GFX9-NEXT: s_lshr_b32 s26, s19, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 23 +; GFX9-NEXT: s_lshr_b32 s26, s19, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 24 +; GFX9-NEXT: s_lshr_b32 s26, s19, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 25 +; GFX9-NEXT: s_lshr_b32 s26, s21, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 26 +; GFX9-NEXT: s_lshr_b32 s26, s21, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 27 +; GFX9-NEXT: s_lshr_b32 s26, s21, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 28 +; GFX9-NEXT: s_lshr_b32 s26, s23, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 29 +; GFX9-NEXT: s_lshr_b32 s26, s23, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 30 +; GFX9-NEXT: s_lshr_b32 s26, s23, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 31 +; GFX9-NEXT: s_lshr_b32 s26, s25, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 32 +; GFX9-NEXT: s_lshr_b32 s26, s25, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 33 +; GFX9-NEXT: s_lshr_b32 s26, s25, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 34 +; GFX9-NEXT: s_lshr_b32 s26, s41, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 35 +; GFX9-NEXT: s_lshr_b32 s26, s41, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 36 +; GFX9-NEXT: s_lshr_b32 s26, s41, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 37 +; GFX9-NEXT: s_lshr_b32 s26, s43, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 38 +; GFX9-NEXT: s_lshr_b32 s26, s43, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 39 +; GFX9-NEXT: s_lshr_b32 s26, s43, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 40 +; GFX9-NEXT: s_lshr_b32 s26, s45, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 41 +; GFX9-NEXT: s_lshr_b32 s26, s45, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 42 +; GFX9-NEXT: s_lshr_b32 s26, s45, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 43 +; GFX9-NEXT: s_lshr_b32 s26, s47, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 44 +; GFX9-NEXT: s_lshr_b32 s26, s47, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 45 +; GFX9-NEXT: s_lshr_b32 s26, s47, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 46 +; GFX9-NEXT: s_lshr_b32 s26, s57, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 47 +; GFX9-NEXT: s_lshr_b32 s26, s57, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 48 +; GFX9-NEXT: s_lshr_b32 s26, s57, 8 +; GFX9-NEXT: s_lshr_b32 s83, s4, 16 +; GFX9-NEXT: s_lshr_b32 s82, s4, 8 +; GFX9-NEXT: s_lshr_b32 s85, s8, 16 +; GFX9-NEXT: s_lshr_b32 s84, s8, 8 +; GFX9-NEXT: s_lshr_b32 s87, s10, 16 +; GFX9-NEXT: s_lshr_b32 s86, s10, 8 +; GFX9-NEXT: s_lshr_b32 s97, s12, 16 +; GFX9-NEXT: s_lshr_b32 s96, s12, 8 +; GFX9-NEXT: s_lshr_b32 s99, s14, 16 +; GFX9-NEXT: s_lshr_b32 s98, s14, 8 +; GFX9-NEXT: s_lshr_b32 s39, s16, 16 +; GFX9-NEXT: s_lshr_b32 s38, s16, 8 +; GFX9-NEXT: s_lshr_b32 s49, s18, 16 +; GFX9-NEXT: s_lshr_b32 s48, s18, 8 +; GFX9-NEXT: s_lshr_b32 s51, s20, 16 +; GFX9-NEXT: s_lshr_b32 s50, s20, 8 +; GFX9-NEXT: s_lshr_b32 s53, s22, 16 +; GFX9-NEXT: s_lshr_b32 s52, s22, 8 +; GFX9-NEXT: s_lshr_b32 s55, s24, 16 +; GFX9-NEXT: s_lshr_b32 s54, s24, 8 +; GFX9-NEXT: s_lshr_b32 s65, s40, 16 +; GFX9-NEXT: s_lshr_b32 s64, s40, 8 +; GFX9-NEXT: s_lshr_b32 s67, s42, 16 +; GFX9-NEXT: s_lshr_b32 s66, s42, 8 +; GFX9-NEXT: s_lshr_b32 s69, s44, 16 +; GFX9-NEXT: s_lshr_b32 s68, s44, 8 +; GFX9-NEXT: s_lshr_b32 s71, s46, 16 +; GFX9-NEXT: s_lshr_b32 s70, s46, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 49 +; GFX9-NEXT: s_lshr_b32 s81, s56, 16 +; GFX9-NEXT: s_lshr_b32 s80, s56, 8 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[6:7], 24 +; GFX9-NEXT: s_lshr_b64 s[28:29], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 +; GFX9-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 +; GFX9-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; GFX9-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 +; GFX9-NEXT: s_lshr_b64 s[74:75], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[76:77], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[88:89], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[90:91], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[92:93], s[40:41], 24 +; GFX9-NEXT: s_lshr_b64 s[94:95], s[42:43], 24 +; GFX9-NEXT: s_lshr_b64 s[30:31], s[44:45], 24 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[46:47], 24 +; GFX9-NEXT: s_lshr_b64 s[36:37], s[56:57], 24 ; GFX9-NEXT: s_cbranch_execnz .LBB73_4 ; GFX9-NEXT: .LBB73_2: ; %cmp.true -; GFX9-NEXT: v_add_f64 v[11:12], s[4:5], 1.0 -; GFX9-NEXT: v_add_f64 v[1:2], s[44:45], 1.0 -; GFX9-NEXT: v_add_f64 v[3:4], s[42:43], 1.0 -; GFX9-NEXT: v_add_f64 v[5:6], s[40:41], 1.0 -; GFX9-NEXT: v_add_f64 v[7:8], s[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[1:2], s[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[3:4], s[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[5:6], s[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[35:36], s[40:41], 1.0 +; GFX9-NEXT: v_add_f64 v[37:38], s[42:43], 1.0 +; GFX9-NEXT: v_add_f64 v[7:8], s[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[48:49], s[44:45], 1.0 +; GFX9-NEXT: v_add_f64 v[13:14], s[14:15], 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[1:2] ; GFX9-NEXT: v_add_f64 v[9:10], s[12:13], 1.0 -; GFX9-NEXT: v_add_f64 v[13:14], s[10:11], 1.0 -; GFX9-NEXT: v_add_f64 v[19:20], s[8:9], 1.0 -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12] -; GFX9-NEXT: v_add_f64 v[23:24], s[6:7], 1.0 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] -; GFX9-NEXT: v_add_f64 v[27:28], s[28:29], 1.0 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: v_add_f64 v[52:53], s[46:47], 1.0 +; GFX9-NEXT: v_add_f64 v[15:16], s[16:17], 1.0 +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[3:4] -; GFX9-NEXT: v_add_f64 v[31:32], s[26:27], 1.0 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[5:6] +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: v_add_f64 v[39:40], s[56:57], 1.0 ; GFX9-NEXT: v_add_f64 v[33:34], s[24:25], 1.0 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[7:8] -; GFX9-NEXT: v_add_f64 v[35:36], s[22:23], 1.0 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10] -; GFX9-NEXT: v_add_f64 v[37:38], s[20:21], 1.0 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[13:14] -; GFX9-NEXT: v_add_f64 v[48:49], s[18:19], 1.0 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[19:20] -; GFX9-NEXT: v_add_f64 v[50:51], s[16:17], 1.0 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[23:24] -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v37 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[27:28] -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v48 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[31:32] -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 8, v20 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[33:34] -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v24 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[35:36] -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v28 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[37:38] -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 24, v32 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[48:49] -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v32 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[50:51] -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v32 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v12 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v12 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v12 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v11 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v11 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v2 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v2 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v2 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v1 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v4 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v4 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v4 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v3 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v6 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v6 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v6 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v5 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v5 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v8 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v8 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v8 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v7 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v10 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v10 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v10 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v9 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v9 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v14 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v14 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v13 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v20 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v20 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v19 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v19 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v27 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v24 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v27 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v23 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v31 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v23 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v33 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v28 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v35 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v31 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 24, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 24, v36 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v36 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v36 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 24, v38 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v38 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 8, v38 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v37 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v49 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v49 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 8, v49 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v48 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 24, v51 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v51 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v51 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v50 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v50 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: v_add_f64 v[31:32], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[28:29], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[19:20], s[18:19], 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[3:4] +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[35:36] +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[42:43], 24, v[37:38] +; GFX9-NEXT: v_readfirstlane_b32 s13, v10 +; GFX9-NEXT: v_lshrrev_b64 v[10:11], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[13:14] +; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[48:49] +; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[15:16] +; GFX9-NEXT: v_lshrrev_b64 v[44:45], 24, v[52:53] +; GFX9-NEXT: v_readfirstlane_b32 s57, v40 +; GFX9-NEXT: v_readfirstlane_b32 s47, v53 +; GFX9-NEXT: v_readfirstlane_b32 s45, v49 +; GFX9-NEXT: v_readfirstlane_b32 s43, v38 +; GFX9-NEXT: v_readfirstlane_b32 s41, v36 +; GFX9-NEXT: v_readfirstlane_b32 s25, v34 +; GFX9-NEXT: v_readfirstlane_b32 s23, v32 +; GFX9-NEXT: v_readfirstlane_b32 s21, v29 +; GFX9-NEXT: v_readfirstlane_b32 s19, v20 +; GFX9-NEXT: v_readfirstlane_b32 s17, v16 +; GFX9-NEXT: v_readfirstlane_b32 s15, v14 +; GFX9-NEXT: v_readfirstlane_b32 s11, v8 +; GFX9-NEXT: v_readfirstlane_b32 s9, v6 +; GFX9-NEXT: v_readfirstlane_b32 s5, v4 +; GFX9-NEXT: v_readfirstlane_b32 s7, v2 +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[26:27], 24, v[19:20] +; GFX9-NEXT: v_lshrrev_b64 v[29:30], 24, v[28:29] +; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[31:32] +; GFX9-NEXT: v_lshrrev_b64 v[54:55], 24, v[33:34] +; GFX9-NEXT: v_lshrrev_b64 v[45:46], 24, v[39:40] +; GFX9-NEXT: s_lshr_b32 s10, s7, 24 +; GFX9-NEXT: s_lshr_b32 s12, s7, 16 +; GFX9-NEXT: s_lshr_b32 s14, s7, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX9-NEXT: s_lshr_b32 s16, s5, 24 +; GFX9-NEXT: s_lshr_b32 s18, s5, 16 +; GFX9-NEXT: s_lshr_b32 s20, s5, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v3 +; GFX9-NEXT: s_lshr_b32 s22, s9, 24 +; GFX9-NEXT: s_lshr_b32 s24, s9, 16 +; GFX9-NEXT: s_lshr_b32 s26, s9, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 8, v5 +; GFX9-NEXT: s_lshr_b32 s27, s11, 24 +; GFX9-NEXT: s_lshr_b32 s28, s11, 16 +; GFX9-NEXT: s_lshr_b32 s29, s11, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v7 +; GFX9-NEXT: s_lshr_b32 s40, s13, 24 +; GFX9-NEXT: s_lshr_b32 s42, s13, 16 +; GFX9-NEXT: s_lshr_b32 s44, s13, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 8, v9 +; GFX9-NEXT: s_lshr_b32 s46, s15, 24 +; GFX9-NEXT: s_lshr_b32 s56, s15, 16 +; GFX9-NEXT: s_lshr_b32 s58, s15, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 8, v13 +; GFX9-NEXT: s_lshr_b32 s59, s17, 24 +; GFX9-NEXT: s_lshr_b32 s60, s17, 16 +; GFX9-NEXT: s_lshr_b32 s61, s17, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v15 +; GFX9-NEXT: s_lshr_b32 s62, s19, 24 +; GFX9-NEXT: s_lshr_b32 s63, s19, 16 +; GFX9-NEXT: s_lshr_b32 s72, s19, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v19 +; GFX9-NEXT: s_lshr_b32 s73, s21, 24 +; GFX9-NEXT: s_lshr_b32 s74, s21, 16 +; GFX9-NEXT: s_lshr_b32 s75, s21, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v28 +; GFX9-NEXT: s_lshr_b32 s76, s23, 24 +; GFX9-NEXT: s_lshr_b32 s77, s23, 16 +; GFX9-NEXT: s_lshr_b32 s78, s23, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v31 +; GFX9-NEXT: s_lshr_b32 s79, s25, 24 +; GFX9-NEXT: s_lshr_b32 s88, s25, 16 +; GFX9-NEXT: s_lshr_b32 s89, s25, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v33 +; GFX9-NEXT: s_lshr_b32 s90, s41, 24 +; GFX9-NEXT: s_lshr_b32 s91, s41, 16 +; GFX9-NEXT: s_lshr_b32 s92, s41, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v35 +; GFX9-NEXT: s_lshr_b32 s93, s43, 24 +; GFX9-NEXT: s_lshr_b32 s94, s43, 16 +; GFX9-NEXT: s_lshr_b32 s95, s43, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v37 +; GFX9-NEXT: s_lshr_b32 vcc_lo, s45, 24 +; GFX9-NEXT: s_lshr_b32 vcc_hi, s45, 16 +; GFX9-NEXT: s_lshr_b32 s30, s45, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 8, v48 +; GFX9-NEXT: s_lshr_b32 s31, s47, 24 +; GFX9-NEXT: s_lshr_b32 s34, s47, 16 +; GFX9-NEXT: s_lshr_b32 s35, s47, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v52 +; GFX9-NEXT: s_lshr_b32 s8, s57, 24 +; GFX9-NEXT: s_lshr_b32 s36, s57, 16 +; GFX9-NEXT: s_lshr_b32 s6, s57, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v39 ; GFX9-NEXT: s_branch .LBB73_5 ; GFX9-NEXT: .LBB73_3: -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 ; GFX9-NEXT: ; implicit-def: $sgpr80 ; GFX9-NEXT: ; implicit-def: $sgpr81 ; GFX9-NEXT: ; implicit-def: $sgpr70 @@ -120334,417 +120148,435 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: ; implicit-def: $sgpr62 ; GFX9-NEXT: ; implicit-def: $sgpr60 ; GFX9-NEXT: ; implicit-def: $sgpr58 -; GFX9-NEXT: ; implicit-def: $sgpr56 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr28 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: s_branch .LBB73_2 ; GFX9-NEXT: .LBB73_4: -; GFX9-NEXT: v_mov_b32_e32 v41, s66 -; GFX9-NEXT: v_mov_b32_e32 v40, s36 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s34 -; GFX9-NEXT: v_mov_b32_e32 v15, s81 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s71 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s69 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s67 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s65 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s55 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s53 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s52 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s51 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s50 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s49 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s48 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s39 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s38 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s99 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s98 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s97 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s96 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s87 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s86 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s85 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s84 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s83 -; GFX9-NEXT: v_mov_b32_e32 v11, s4 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s82 -; GFX9-NEXT: v_readlane_b32 s4, v62, 0 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 1 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 2 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 3 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 4 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 5 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 6 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 7 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 8 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 9 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s30 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 10 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 11 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 12 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 13 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 14 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 15 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 16 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 17 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 18 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 19 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 20 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 21 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 22 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 23 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 24 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 25 -; GFX9-NEXT: v_mov_b32_e32 v30, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 26 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 27 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 28 -; GFX9-NEXT: v_mov_b32_e32 v29, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 29 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 30 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 31 -; GFX9-NEXT: v_mov_b32_e32 v44, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 32 -; GFX9-NEXT: v_mov_b32_e32 v26, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 33 -; GFX9-NEXT: v_mov_b32_e32 v53, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 34 -; GFX9-NEXT: v_mov_b32_e32 v57, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 35 -; GFX9-NEXT: v_mov_b32_e32 v39, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 36 -; GFX9-NEXT: v_mov_b32_e32 v55, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 37 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s94 -; GFX9-NEXT: v_mov_b32_e32 v61, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 38 -; GFX9-NEXT: v_mov_b32_e32 v42, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 39 -; GFX9-NEXT: v_mov_b32_e32 v46, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 40 -; GFX9-NEXT: v_mov_b32_e32 v21, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 41 -; GFX9-NEXT: v_mov_b32_e32 v47, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 42 -; GFX9-NEXT: v_mov_b32_e32 v16, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 43 -; GFX9-NEXT: v_mov_b32_e32 v18, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 44 -; GFX9-NEXT: v_mov_b32_e32 v17, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 45 -; GFX9-NEXT: v_mov_b32_e32 v58, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 46 -; GFX9-NEXT: v_mov_b32_e32 v22, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 47 -; GFX9-NEXT: v_mov_b32_e32 v59, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 48 -; GFX9-NEXT: v_mov_b32_e32 v60, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 49 -; GFX9-NEXT: v_mov_b32_e32 v12, s5 -; GFX9-NEXT: v_mov_b32_e32 v1, s44 -; GFX9-NEXT: v_mov_b32_e32 v2, s45 -; GFX9-NEXT: v_mov_b32_e32 v3, s42 -; GFX9-NEXT: v_mov_b32_e32 v4, s43 -; GFX9-NEXT: v_mov_b32_e32 v5, s40 -; GFX9-NEXT: v_mov_b32_e32 v6, s41 -; GFX9-NEXT: v_mov_b32_e32 v7, s14 -; GFX9-NEXT: v_mov_b32_e32 v8, s15 -; GFX9-NEXT: v_mov_b32_e32 v9, s12 -; GFX9-NEXT: v_mov_b32_e32 v10, s13 -; GFX9-NEXT: v_mov_b32_e32 v13, s10 -; GFX9-NEXT: v_mov_b32_e32 v14, s11 -; GFX9-NEXT: v_mov_b32_e32 v19, s8 -; GFX9-NEXT: v_mov_b32_e32 v20, s9 -; GFX9-NEXT: v_mov_b32_e32 v23, s6 -; GFX9-NEXT: v_mov_b32_e32 v24, s7 -; GFX9-NEXT: v_mov_b32_e32 v27, s28 -; GFX9-NEXT: v_mov_b32_e32 v28, s29 -; GFX9-NEXT: v_mov_b32_e32 v31, s26 -; GFX9-NEXT: v_mov_b32_e32 v32, s27 +; GFX9-NEXT: v_mov_b32_e32 v22, s62 +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v23, s60 +; GFX9-NEXT: v_mov_b32_e32 v22, s58 +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v22, s28 +; GFX9-NEXT: v_readlane_b32 s27, v62, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s27 +; GFX9-NEXT: v_readlane_b32 s27, v62, 1 +; GFX9-NEXT: v_mov_b32_e32 v60, s81 +; GFX9-NEXT: v_mov_b32_e32 v21, s80 +; GFX9-NEXT: v_mov_b32_e32 v57, s71 +; GFX9-NEXT: v_mov_b32_e32 v17, s70 +; GFX9-NEXT: v_mov_b32_e32 v47, s69 +; GFX9-NEXT: v_mov_b32_e32 v11, s68 +; GFX9-NEXT: v_mov_b32_e32 v46, s67 +; GFX9-NEXT: v_mov_b32_e32 v61, s66 +; GFX9-NEXT: v_mov_b32_e32 v40, s65 +; GFX9-NEXT: v_mov_b32_e32 v59, s64 +; GFX9-NEXT: v_mov_b32_e32 v55, s55 +; GFX9-NEXT: v_mov_b32_e32 v58, s54 +; GFX9-NEXT: v_mov_b32_e32 v53, s53 +; GFX9-NEXT: v_mov_b32_e32 v56, s52 +; GFX9-NEXT: v_mov_b32_e32 v49, s51 +; GFX9-NEXT: v_mov_b32_e32 v51, s50 +; GFX9-NEXT: v_mov_b32_e32 v36, s49 +; GFX9-NEXT: v_mov_b32_e32 v38, s48 +; GFX9-NEXT: v_mov_b32_e32 v32, s39 +; GFX9-NEXT: v_mov_b32_e32 v34, s38 +; GFX9-NEXT: v_mov_b32_e32 v27, s99 +; GFX9-NEXT: v_mov_b32_e32 v30, s98 +; GFX9-NEXT: v_mov_b32_e32 v18, s97 +; GFX9-NEXT: v_mov_b32_e32 v20, s96 +; GFX9-NEXT: v_mov_b32_e32 v14, s87 +; GFX9-NEXT: v_mov_b32_e32 v16, s86 +; GFX9-NEXT: v_mov_b32_e32 v10, s85 +; GFX9-NEXT: v_mov_b32_e32 v12, s84 +; GFX9-NEXT: v_mov_b32_e32 v6, s83 +; GFX9-NEXT: v_mov_b32_e32 v8, s82 +; GFX9-NEXT: v_mov_b32_e32 v4, s27 +; GFX9-NEXT: v_mov_b32_e32 v39, s56 +; GFX9-NEXT: v_mov_b32_e32 v52, s46 +; GFX9-NEXT: v_mov_b32_e32 v48, s44 +; GFX9-NEXT: v_mov_b32_e32 v37, s42 +; GFX9-NEXT: v_mov_b32_e32 v35, s40 ; GFX9-NEXT: v_mov_b32_e32 v33, s24 -; GFX9-NEXT: v_mov_b32_e32 v34, s25 -; GFX9-NEXT: v_mov_b32_e32 v35, s22 -; GFX9-NEXT: v_mov_b32_e32 v36, s23 -; GFX9-NEXT: v_mov_b32_e32 v37, s20 -; GFX9-NEXT: v_mov_b32_e32 v38, s21 -; GFX9-NEXT: v_mov_b32_e32 v48, s18 -; GFX9-NEXT: v_mov_b32_e32 v49, s19 -; GFX9-NEXT: v_mov_b32_e32 v50, s16 -; GFX9-NEXT: v_mov_b32_e32 v51, s17 -; GFX9-NEXT: v_mov_b32_e32 v56, s80 -; GFX9-NEXT: v_mov_b32_e32 v45, s70 -; GFX9-NEXT: v_mov_b32_e32 v43, s68 -; GFX9-NEXT: v_mov_b32_e32 v54, s64 -; GFX9-NEXT: v_mov_b32_e32 v52, s54 -; GFX9-NEXT: v_mov_b32_e32 v25, s4 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s92 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s90 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s88 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s78 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s76 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s74 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s72 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s62 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s60 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s58 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s56 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s46 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v31, s22 +; GFX9-NEXT: v_mov_b32_e32 v28, s20 +; GFX9-NEXT: v_mov_b32_e32 v19, s18 +; GFX9-NEXT: v_mov_b32_e32 v15, s16 +; GFX9-NEXT: v_mov_b32_e32 v13, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v45, s36 +; GFX9-NEXT: v_mov_b32_e32 v44, s34 +; GFX9-NEXT: v_mov_b32_e32 v43, s30 +; GFX9-NEXT: v_mov_b32_e32 v42, s94 +; GFX9-NEXT: v_mov_b32_e32 v41, s92 +; GFX9-NEXT: v_mov_b32_e32 v54, s90 +; GFX9-NEXT: v_mov_b32_e32 v50, s88 +; GFX9-NEXT: v_mov_b32_e32 v29, s78 +; GFX9-NEXT: v_mov_b32_e32 v26, s76 +; GFX9-NEXT: v_mov_b32_e32 v25, s74 +; GFX9-NEXT: v_mov_b32_e32 v24, s72 +; GFX9-NEXT: v_readlane_b32 s10, v62, 2 +; GFX9-NEXT: v_readlane_b32 s12, v62, 3 +; GFX9-NEXT: v_readlane_b32 s14, v62, 4 +; GFX9-NEXT: v_readlane_b32 s16, v62, 5 +; GFX9-NEXT: v_readlane_b32 s18, v62, 6 +; GFX9-NEXT: v_readlane_b32 s20, v62, 7 +; GFX9-NEXT: v_readlane_b32 s22, v62, 8 +; GFX9-NEXT: v_readlane_b32 s24, v62, 9 +; GFX9-NEXT: v_readlane_b32 s27, v62, 11 +; GFX9-NEXT: v_readlane_b32 s28, v62, 12 +; GFX9-NEXT: v_readlane_b32 s29, v62, 13 +; GFX9-NEXT: v_readlane_b32 s40, v62, 14 +; GFX9-NEXT: v_readlane_b32 s42, v62, 15 +; GFX9-NEXT: v_readlane_b32 s44, v62, 16 +; GFX9-NEXT: v_readlane_b32 s46, v62, 17 +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v22, s26 +; GFX9-NEXT: v_readlane_b32 s26, v62, 10 +; GFX9-NEXT: v_readlane_b32 s56, v62, 18 +; GFX9-NEXT: v_readlane_b32 s58, v62, 19 +; GFX9-NEXT: v_readlane_b32 s59, v62, 20 +; GFX9-NEXT: v_readlane_b32 s60, v62, 21 +; GFX9-NEXT: v_readlane_b32 s61, v62, 22 +; GFX9-NEXT: v_readlane_b32 s62, v62, 23 +; GFX9-NEXT: v_readlane_b32 s63, v62, 24 +; GFX9-NEXT: v_readlane_b32 s72, v62, 25 +; GFX9-NEXT: v_readlane_b32 s73, v62, 26 +; GFX9-NEXT: v_readlane_b32 s74, v62, 27 +; GFX9-NEXT: v_readlane_b32 s75, v62, 28 +; GFX9-NEXT: v_readlane_b32 s76, v62, 29 +; GFX9-NEXT: v_readlane_b32 s77, v62, 30 +; GFX9-NEXT: v_readlane_b32 s78, v62, 31 +; GFX9-NEXT: v_readlane_b32 s79, v62, 32 +; GFX9-NEXT: v_readlane_b32 s88, v62, 33 +; GFX9-NEXT: v_readlane_b32 s89, v62, 34 +; GFX9-NEXT: v_readlane_b32 s90, v62, 35 +; GFX9-NEXT: v_readlane_b32 s91, v62, 36 +; GFX9-NEXT: v_readlane_b32 s92, v62, 37 +; GFX9-NEXT: v_readlane_b32 s93, v62, 38 +; GFX9-NEXT: v_readlane_b32 s94, v62, 39 +; GFX9-NEXT: v_readlane_b32 s95, v62, 40 +; GFX9-NEXT: v_readlane_b32 vcc_lo, v62, 41 +; GFX9-NEXT: v_readlane_b32 vcc_hi, v62, 42 +; GFX9-NEXT: v_readlane_b32 s30, v62, 43 +; GFX9-NEXT: v_readlane_b32 s31, v62, 44 +; GFX9-NEXT: v_readlane_b32 s34, v62, 45 +; GFX9-NEXT: v_readlane_b32 s35, v62, 46 +; GFX9-NEXT: v_readlane_b32 s8, v62, 47 +; GFX9-NEXT: v_readlane_b32 s36, v62, 48 +; GFX9-NEXT: v_readlane_b32 s6, v62, 49 +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: .LBB73_5: ; %end ; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 -; GFX9-NEXT: v_or_b32_sdwa v21, v36, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v54 -; GFX9-NEXT: v_or_b32_sdwa v33, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v61 -; GFX9-NEXT: v_or_b32_sdwa v34, v34, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v52 -; GFX9-NEXT: v_or_b32_sdwa v31, v31, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v57 -; GFX9-NEXT: v_or_b32_sdwa v32, v32, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v29 -; GFX9-NEXT: v_or_b32_sdwa v24, v24, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v25 -; GFX9-NEXT: v_or_b32_sdwa v25, v51, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v51, 8, v45 -; GFX9-NEXT: v_or_b32_sdwa v48, v48, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b32_e32 v40, 8, v56 -; GFX9-NEXT: v_or_b32_sdwa v50, v50, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v22, 8, v22 -; GFX9-NEXT: v_or_b32_sdwa v22, v49, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v49, 8, v43 -; GFX9-NEXT: v_or_b32_sdwa v37, v37, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v39, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v61 +; GFX9-NEXT: v_or_b32_sdwa v37, v37, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v59 +; GFX9-NEXT: v_or_b32_sdwa v35, v35, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v58 +; GFX9-NEXT: s_and_b32 s4, s57, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: v_or_b32_sdwa v33, v33, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v56 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s36, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s8, 8 +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v45 +; GFX9-NEXT: s_or_b32 s6, s6, s8 +; GFX9-NEXT: v_or_b32_sdwa v39, v60, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: v_or_b32_sdwa v21, v21, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen +; GFX9-NEXT: v_mov_b32_e32 v21, s4 +; GFX9-NEXT: s_and_b32 s4, s47, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s35, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s34, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s31, 8 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; GFX9-NEXT: v_or_b32_sdwa v17, v58, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v17, v22, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; GFX9-NEXT: v_or_b32_sdwa v18, v38, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v41 -; GFX9-NEXT: v_or_b32_sdwa v35, v35, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v44 +; GFX9-NEXT: s_or_b32 s6, s6, s8 +; GFX9-NEXT: v_or_b32_sdwa v17, v52, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v57, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: s_and_b32 s4, s45, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s30, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, vcc_hi, 0xff +; GFX9-NEXT: s_lshl_b32 s8, vcc_lo, 8 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v43 +; GFX9-NEXT: s_or_b32 s6, s6, s8 +; GFX9-NEXT: v_or_b32_sdwa v11, v48, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v47, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-NEXT: s_and_b32 s4, s43, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s95, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s94, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s93, 8 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v42 +; GFX9-NEXT: s_or_b32 s6, s6, s8 +; GFX9-NEXT: v_or_b32_sdwa v11, v46, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: v_or_b32_sdwa v11, v37, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-NEXT: s_and_b32 s4, s41, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s92, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s91, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s90, 8 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v41 +; GFX9-NEXT: s_or_b32 s6, s6, s8 +; GFX9-NEXT: v_or_b32_sdwa v11, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: v_or_b32_sdwa v11, v35, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-NEXT: s_and_b32 s4, s25, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s89, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s88, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s79, 8 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v54 +; GFX9-NEXT: s_or_b32 s6, s6, s8 +; GFX9-NEXT: v_or_b32_sdwa v11, v55, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: v_or_b32_sdwa v11, v33, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-NEXT: s_and_b32 s4, s23, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s78, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s77, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s76, 8 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v50 +; GFX9-NEXT: s_or_b32 s6, s6, s8 +; GFX9-NEXT: v_or_b32_sdwa v11, v53, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: v_or_b32_sdwa v11, v31, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-NEXT: s_and_b32 s4, s21, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s75, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s74, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s73, 8 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v51 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v29 +; GFX9-NEXT: s_or_b32 s6, s6, s8 +; GFX9-NEXT: v_or_b32_sdwa v11, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v49, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-NEXT: s_and_b32 s4, s19, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s72, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s63, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s62, 8 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v38 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v26 +; GFX9-NEXT: s_or_b32 s6, s6, s8 +; GFX9-NEXT: v_or_b32_sdwa v11, v19, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v36, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:64 +; GFX9-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-NEXT: s_and_b32 s4, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s61, 8 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:68 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v34 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s60, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s59, 8 +; GFX9-NEXT: v_or_b32_sdwa v11, v15, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v25 +; GFX9-NEXT: s_or_b32 s6, s6, s8 +; GFX9-NEXT: v_or_b32_sdwa v15, v32, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:72 +; GFX9-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-NEXT: s_and_b32 s4, s15, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s58, 8 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:76 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v30 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s56, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s46, 8 +; GFX9-NEXT: v_or_b32_sdwa v11, v13, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v24 +; GFX9-NEXT: s_or_b32 s6, s6, s8 +; GFX9-NEXT: v_or_b32_sdwa v13, v27, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:80 +; GFX9-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:84 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v20 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_and_b32 s4, s13, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s44, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s42, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s40, 8 +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s6, s6, s8 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_lshl_b32 s6, s29, 8 +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 ; GFX9-NEXT: v_readlane_b32 s99, v63, 35 ; GFX9-NEXT: v_readlane_b32 s98, v63, 34 ; GFX9-NEXT: v_readlane_b32 s97, v63, 33 @@ -120781,326 +120613,87 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_readlane_b32 s34, v63, 2 ; GFX9-NEXT: v_readlane_b32 s31, v63, 1 ; GFX9-NEXT: v_readlane_b32 s30, v63, 0 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v36 -; GFX9-NEXT: v_or_b32_sdwa v27, v27, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v44 -; GFX9-NEXT: v_or_b32_sdwa v28, v28, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v29 -; GFX9-NEXT: v_or_b32_sdwa v29, v19, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v30 -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v20, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v51 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v36 -; GFX9-NEXT: v_or_b32_sdwa v23, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v19, v30, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v19, v50, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v59 -; GFX9-NEXT: v_or_b32_sdwa v19, v60, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v19, v25, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v49 -; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v19, v25, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v19, v48, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v48 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v17, v19, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v17, v37, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v47 -; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v16, v18, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v16, v35, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v42 -; GFX9-NEXT: v_or_b32_sdwa v16, v46, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v16, v21, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v16, v33, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v39 -; GFX9-NEXT: v_or_b32_sdwa v16, v55, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v16, v34, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v16, v31, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v26 -; GFX9-NEXT: v_or_b32_sdwa v16, v53, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v16, v32, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v18 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v16, v27, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v28, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v23, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v29, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:64 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v20, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:68 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GFX9-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:72 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:76 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; GFX9-NEXT: v_or_b32_sdwa v9, v9, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v9, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:80 -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v10 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v10, v13, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:84 -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v19 +; GFX9-NEXT: v_or_b32_sdwa v11, v18, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:88 +; GFX9-NEXT: v_mov_b32_e32 v9, s4 +; GFX9-NEXT: s_and_b32 s4, s11, 0xff +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:92 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v16 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s28, 0xff ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v23 +; GFX9-NEXT: s_or_b32 s6, s6, s8 +; GFX9-NEXT: v_or_b32_sdwa v9, v14, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:88 -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:92 -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:96 +; GFX9-NEXT: v_mov_b32_e32 v7, s4 +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:100 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v12 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: s_and_b32 s4, s9, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s26, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s22, 8 ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:96 -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-NEXT: s_or_b32 s6, s6, s8 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_lshl_b32 s6, s16, 8 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:100 -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v11 +; GFX9-NEXT: v_or_b32_sdwa v7, v10, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:104 +; GFX9-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:108 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v8 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: s_and_b32 s4, s5, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s20, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:104 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_lshl_b32 s5, s14, 8 +; GFX9-NEXT: s_lshl_b32 s6, s10, 8 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:108 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:112 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:116 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v4 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_and_b32 s4, s7, 0xff +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s12, 0xff +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -121117,8 +120710,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -121128,408 +120721,431 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_or_saveexec_b32 s4, -1 ; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80 -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84 -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:88 -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:92 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:12 ; GFX11-NEXT: s_mov_b32 exec_lo, s4 -; GFX11-NEXT: v_writelane_b32 v76, s30, 0 -; GFX11-NEXT: v_writelane_b32 v77, s96, 0 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v41, s96, 0 +; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1 +; GFX11-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: v_writelane_b32 v41, s97, 1 +; GFX11-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v21, s17 +; GFX11-NEXT: v_dual_mov_b32 v22, s18 :: v_dual_mov_b32 v23, s19 +; GFX11-NEXT: v_writelane_b32 v40, s34, 2 +; GFX11-NEXT: v_writelane_b32 v41, s98, 2 +; GFX11-NEXT: v_dual_mov_b32 v24, s20 :: v_dual_mov_b32 v25, s21 +; GFX11-NEXT: v_dual_mov_b32 v26, s22 :: v_dual_mov_b32 v27, s23 +; GFX11-NEXT: v_writelane_b32 v40, s35, 3 +; GFX11-NEXT: v_writelane_b32 v41, s99, 3 +; GFX11-NEXT: v_dual_mov_b32 v28, s24 :: v_dual_mov_b32 v29, s25 +; GFX11-NEXT: v_dual_mov_b32 v30, s26 :: v_dual_mov_b32 v31, s27 +; GFX11-NEXT: v_writelane_b32 v40, s36, 4 +; GFX11-NEXT: v_writelane_b32 v41, s100, 4 +; GFX11-NEXT: v_dual_mov_b32 v32, s28 :: v_dual_mov_b32 v33, s29 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 -; GFX11-NEXT: v_readfirstlane_b32 s4, v1 -; GFX11-NEXT: v_readfirstlane_b32 s5, v2 -; GFX11-NEXT: v_writelane_b32 v76, s31, 1 -; GFX11-NEXT: v_writelane_b32 v77, s97, 1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v3 -; GFX11-NEXT: v_readfirstlane_b32 s7, v4 +; GFX11-NEXT: v_writelane_b32 v40, s37, 5 +; GFX11-NEXT: v_writelane_b32 v41, s101, 5 +; GFX11-NEXT: v_readfirstlane_b32 s40, v16 +; GFX11-NEXT: v_readfirstlane_b32 s41, v17 +; GFX11-NEXT: v_readfirstlane_b32 s28, v18 +; GFX11-NEXT: v_writelane_b32 v40, s38, 6 +; GFX11-NEXT: v_writelane_b32 v41, s102, 6 +; GFX11-NEXT: v_readfirstlane_b32 s29, v19 +; GFX11-NEXT: v_readfirstlane_b32 s26, v20 +; GFX11-NEXT: v_readfirstlane_b32 s27, v21 +; GFX11-NEXT: v_writelane_b32 v40, s39, 7 +; GFX11-NEXT: v_writelane_b32 v41, s103, 7 +; GFX11-NEXT: v_readfirstlane_b32 s24, v22 +; GFX11-NEXT: v_readfirstlane_b32 s25, v23 +; GFX11-NEXT: v_readfirstlane_b32 s22, v24 +; GFX11-NEXT: v_writelane_b32 v40, s48, 8 +; GFX11-NEXT: v_readfirstlane_b32 s23, v25 +; GFX11-NEXT: v_readfirstlane_b32 s20, v26 +; GFX11-NEXT: v_readfirstlane_b32 s21, v27 +; GFX11-NEXT: v_readfirstlane_b32 s18, v28 +; GFX11-NEXT: v_writelane_b32 v40, s49, 9 +; GFX11-NEXT: v_readfirstlane_b32 s19, v29 +; GFX11-NEXT: v_readfirstlane_b32 s16, v30 +; GFX11-NEXT: v_readfirstlane_b32 s17, v31 +; GFX11-NEXT: v_readfirstlane_b32 s14, v32 +; GFX11-NEXT: v_writelane_b32 v40, s50, 10 +; GFX11-NEXT: v_readfirstlane_b32 s15, v33 +; GFX11-NEXT: v_readfirstlane_b32 s12, v1 +; GFX11-NEXT: v_readfirstlane_b32 s13, v2 +; GFX11-NEXT: v_readfirstlane_b32 s10, v3 +; GFX11-NEXT: v_writelane_b32 v40, s51, 11 +; GFX11-NEXT: v_readfirstlane_b32 s11, v4 ; GFX11-NEXT: v_readfirstlane_b32 s8, v5 -; GFX11-NEXT: v_writelane_b32 v76, s34, 2 -; GFX11-NEXT: v_writelane_b32 v77, s98, 2 ; GFX11-NEXT: v_readfirstlane_b32 s9, v6 -; GFX11-NEXT: v_readfirstlane_b32 s10, v7 -; GFX11-NEXT: v_readfirstlane_b32 s11, v8 -; GFX11-NEXT: v_writelane_b32 v76, s35, 3 -; GFX11-NEXT: v_writelane_b32 v77, s99, 3 -; GFX11-NEXT: v_readfirstlane_b32 s12, v9 -; GFX11-NEXT: v_readfirstlane_b32 s13, v10 -; GFX11-NEXT: v_readfirstlane_b32 s14, v11 -; GFX11-NEXT: v_writelane_b32 v76, s36, 4 -; GFX11-NEXT: v_writelane_b32 v77, s100, 4 -; GFX11-NEXT: v_readfirstlane_b32 s15, v12 -; GFX11-NEXT: v_readfirstlane_b32 s40, v13 -; GFX11-NEXT: v_readfirstlane_b32 s41, v14 -; GFX11-NEXT: v_writelane_b32 v76, s37, 5 -; GFX11-NEXT: v_writelane_b32 v77, s101, 5 -; GFX11-NEXT: s_mov_b32 vcc_hi, 0 +; GFX11-NEXT: v_readfirstlane_b32 s6, v7 +; GFX11-NEXT: v_writelane_b32 v40, s52, 12 +; GFX11-NEXT: v_readfirstlane_b32 s7, v8 +; GFX11-NEXT: v_readfirstlane_b32 s4, v9 +; GFX11-NEXT: v_readfirstlane_b32 s5, v10 +; GFX11-NEXT: v_readfirstlane_b32 s2, v11 +; GFX11-NEXT: v_writelane_b32 v40, s53, 13 +; GFX11-NEXT: v_readfirstlane_b32 s3, v12 +; GFX11-NEXT: v_readfirstlane_b32 s0, v13 +; GFX11-NEXT: v_readfirstlane_b32 s1, v14 +; GFX11-NEXT: s_mov_b32 s49, 0 +; GFX11-NEXT: v_writelane_b32 v40, s54, 14 ; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo -; GFX11-NEXT: s_clause 0x13 ; 80-byte Folded Spill -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72 -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:68 -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:64 -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:60 -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:56 -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:52 -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:48 -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:44 -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:40 -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:36 -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:32 -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:28 -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:24 -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:20 -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:16 -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:12 -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v75, s32 -; GFX11-NEXT: v_writelane_b32 v76, s38, 6 -; GFX11-NEXT: v_writelane_b32 v77, s102, 6 -; GFX11-NEXT: ; implicit-def: $vgpr78 : SGPR spill to VGPR lane -; GFX11-NEXT: ; implicit-def: $vgpr79 : SGPR spill to VGPR lane -; GFX11-NEXT: v_writelane_b32 v76, s39, 7 -; GFX11-NEXT: v_writelane_b32 v77, s103, 7 -; GFX11-NEXT: v_writelane_b32 v76, s48, 8 -; GFX11-NEXT: v_writelane_b32 v77, s104, 8 -; GFX11-NEXT: v_writelane_b32 v76, s49, 9 -; GFX11-NEXT: v_writelane_b32 v76, s50, 10 -; GFX11-NEXT: v_writelane_b32 v76, s51, 11 -; GFX11-NEXT: v_writelane_b32 v76, s52, 12 -; GFX11-NEXT: v_writelane_b32 v76, s53, 13 -; GFX11-NEXT: v_writelane_b32 v76, s54, 14 -; GFX11-NEXT: v_writelane_b32 v76, s55, 15 -; GFX11-NEXT: v_writelane_b32 v76, s64, 16 -; GFX11-NEXT: v_writelane_b32 v76, s65, 17 -; GFX11-NEXT: v_writelane_b32 v76, s66, 18 -; GFX11-NEXT: v_writelane_b32 v76, s67, 19 -; GFX11-NEXT: v_writelane_b32 v76, s68, 20 -; GFX11-NEXT: v_writelane_b32 v76, s69, 21 -; GFX11-NEXT: v_writelane_b32 v76, s70, 22 -; GFX11-NEXT: v_writelane_b32 v76, s71, 23 -; GFX11-NEXT: v_writelane_b32 v76, s80, 24 -; GFX11-NEXT: v_writelane_b32 v76, s81, 25 -; GFX11-NEXT: v_writelane_b32 v76, s82, 26 -; GFX11-NEXT: v_writelane_b32 v76, s83, 27 -; GFX11-NEXT: v_writelane_b32 v76, s84, 28 -; GFX11-NEXT: v_writelane_b32 v76, s85, 29 -; GFX11-NEXT: v_writelane_b32 v76, s86, 30 -; GFX11-NEXT: v_writelane_b32 v76, s87, 31 +; GFX11-NEXT: v_writelane_b32 v41, s104, 8 +; GFX11-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane +; GFX11-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane +; GFX11-NEXT: v_writelane_b32 v40, s55, 15 +; GFX11-NEXT: v_writelane_b32 v40, s64, 16 +; GFX11-NEXT: v_writelane_b32 v40, s65, 17 +; GFX11-NEXT: v_writelane_b32 v40, s66, 18 +; GFX11-NEXT: v_writelane_b32 v40, s67, 19 +; GFX11-NEXT: v_writelane_b32 v40, s68, 20 +; GFX11-NEXT: v_writelane_b32 v40, s69, 21 +; GFX11-NEXT: v_writelane_b32 v40, s70, 22 +; GFX11-NEXT: v_writelane_b32 v40, s71, 23 +; GFX11-NEXT: v_writelane_b32 v40, s80, 24 +; GFX11-NEXT: v_writelane_b32 v40, s81, 25 +; GFX11-NEXT: v_writelane_b32 v40, s82, 26 +; GFX11-NEXT: v_writelane_b32 v40, s83, 27 +; GFX11-NEXT: v_writelane_b32 v40, s84, 28 +; GFX11-NEXT: v_writelane_b32 v40, s85, 29 +; GFX11-NEXT: v_writelane_b32 v40, s86, 30 +; GFX11-NEXT: v_writelane_b32 v40, s87, 31 ; GFX11-NEXT: s_cbranch_scc0 .LBB73_3 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: s_lshr_b32 s42, s13, 16 -; GFX11-NEXT: s_lshr_b32 s50, s41, 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 0 -; GFX11-NEXT: s_lshr_b32 s42, s13, 8 -; GFX11-NEXT: s_lshr_b32 s49, s41, 16 +; GFX11-NEXT: s_lshr_b32 s42, s1, 24 +; GFX11-NEXT: s_lshr_b32 s53, s2, 16 +; GFX11-NEXT: v_writelane_b32 v42, s42, 2 +; GFX11-NEXT: s_lshr_b32 s42, s1, 16 +; GFX11-NEXT: s_lshr_b32 s52, s2, 8 +; GFX11-NEXT: s_lshr_b32 s55, s4, 16 +; GFX11-NEXT: s_lshr_b32 s54, s4, 8 +; GFX11-NEXT: v_writelane_b32 v42, s42, 3 +; GFX11-NEXT: s_lshr_b32 s42, s1, 8 +; GFX11-NEXT: s_lshr_b32 s65, s6, 16 +; GFX11-NEXT: s_lshr_b32 s64, s6, 8 +; GFX11-NEXT: s_lshr_b32 s67, s8, 16 +; GFX11-NEXT: v_writelane_b32 v42, s42, 4 +; GFX11-NEXT: s_lshr_b32 s42, s0, 16 +; GFX11-NEXT: s_lshr_b32 s66, s8, 8 +; GFX11-NEXT: s_lshr_b32 s69, s10, 16 +; GFX11-NEXT: s_lshr_b32 s68, s10, 8 +; GFX11-NEXT: v_writelane_b32 v42, s42, 0 +; GFX11-NEXT: s_lshr_b32 s42, s0, 8 +; GFX11-NEXT: s_lshr_b32 s71, s12, 16 +; GFX11-NEXT: s_lshr_b32 s70, s12, 8 +; GFX11-NEXT: s_lshr_b32 s81, s14, 16 +; GFX11-NEXT: v_writelane_b32 v42, s42, 1 +; GFX11-NEXT: s_lshr_b32 s42, s3, 24 +; GFX11-NEXT: s_lshr_b32 s80, s14, 8 +; GFX11-NEXT: s_lshr_b32 s83, s16, 16 +; GFX11-NEXT: s_lshr_b32 s82, s16, 8 +; GFX11-NEXT: v_writelane_b32 v42, s42, 5 +; GFX11-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-NEXT: s_lshr_b32 s85, s18, 16 +; GFX11-NEXT: s_lshr_b32 s84, s18, 8 +; GFX11-NEXT: s_lshr_b32 s87, s20, 16 +; GFX11-NEXT: v_writelane_b32 v42, s42, 6 +; GFX11-NEXT: s_lshr_b32 s42, s3, 8 +; GFX11-NEXT: s_lshr_b32 s86, s20, 8 +; GFX11-NEXT: s_lshr_b32 s97, s22, 16 +; GFX11-NEXT: s_lshr_b32 s96, s22, 8 +; GFX11-NEXT: v_writelane_b32 v42, s42, 7 +; GFX11-NEXT: s_lshr_b32 s42, s5, 24 +; GFX11-NEXT: s_lshr_b32 s99, s24, 16 +; GFX11-NEXT: s_lshr_b32 s98, s24, 8 +; GFX11-NEXT: s_lshr_b32 s104, s27, 24 +; GFX11-NEXT: v_writelane_b32 v42, s42, 8 +; GFX11-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-NEXT: s_lshr_b32 vcc_hi, s27, 16 +; GFX11-NEXT: s_lshr_b32 s34, s27, 8 +; GFX11-NEXT: s_lshr_b32 s101, s26, 16 +; GFX11-NEXT: v_writelane_b32 v42, s42, 9 +; GFX11-NEXT: s_lshr_b32 s42, s5, 8 +; GFX11-NEXT: s_lshr_b32 s100, s26, 8 +; GFX11-NEXT: s_lshr_b32 s35, s29, 24 +; GFX11-NEXT: s_lshr_b32 s36, s29, 16 +; GFX11-NEXT: v_writelane_b32 v42, s42, 10 +; GFX11-NEXT: s_lshr_b32 s42, s7, 24 +; GFX11-NEXT: s_lshr_b32 s37, s29, 8 +; GFX11-NEXT: s_lshr_b32 s50, s28, 16 +; GFX11-NEXT: s_lshr_b32 s102, s28, 8 +; GFX11-NEXT: v_writelane_b32 v42, s42, 11 +; GFX11-NEXT: s_lshr_b32 s42, s7, 16 +; GFX11-NEXT: s_lshr_b32 s38, s41, 24 +; GFX11-NEXT: s_lshr_b32 s39, s41, 16 ; GFX11-NEXT: s_lshr_b32 s48, s41, 8 -; GFX11-NEXT: s_lshr_b32 s52, s40, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 1 -; GFX11-NEXT: s_lshr_b32 s42, s11, 24 +; GFX11-NEXT: v_writelane_b32 v42, s42, 12 +; GFX11-NEXT: s_lshr_b32 s42, s7, 8 +; GFX11-NEXT: s_lshr_b32 s103, s40, 16 ; GFX11-NEXT: s_lshr_b32 s51, s40, 8 -; GFX11-NEXT: s_lshr_b32 s39, s15, 24 -; GFX11-NEXT: s_lshr_b32 s38, s15, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 2 -; GFX11-NEXT: s_lshr_b32 s42, s11, 16 -; GFX11-NEXT: s_lshr_b32 s37, s15, 8 -; GFX11-NEXT: s_lshr_b32 s54, s14, 16 -; GFX11-NEXT: s_lshr_b32 s53, s14, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 3 -; GFX11-NEXT: s_lshr_b32 s42, s11, 8 -; GFX11-NEXT: s_lshr_b32 s36, s13, 24 -; GFX11-NEXT: s_lshr_b32 s64, s12, 16 -; GFX11-NEXT: s_lshr_b32 s55, s12, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 4 +; GFX11-NEXT: s_lshr_b64 s[62:63], s[0:1], 24 +; GFX11-NEXT: v_writelane_b32 v42, s42, 13 ; GFX11-NEXT: s_lshr_b32 s42, s9, 24 -; GFX11-NEXT: s_lshr_b32 s66, s10, 16 -; GFX11-NEXT: s_lshr_b32 s65, s10, 8 -; GFX11-NEXT: s_lshr_b32 s68, s8, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 5 +; GFX11-NEXT: s_lshr_b64 s[72:73], s[2:3], 24 +; GFX11-NEXT: s_lshr_b64 s[74:75], s[4:5], 24 +; GFX11-NEXT: s_lshr_b64 s[76:77], s[6:7], 24 +; GFX11-NEXT: v_writelane_b32 v42, s42, 14 ; GFX11-NEXT: s_lshr_b32 s42, s9, 16 -; GFX11-NEXT: s_lshr_b32 s67, s8, 8 -; GFX11-NEXT: s_lshr_b32 s70, s6, 16 -; GFX11-NEXT: s_lshr_b32 s69, s6, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 6 -; GFX11-NEXT: s_lshr_b32 s42, s9, 8 -; GFX11-NEXT: s_lshr_b32 s80, s4, 16 -; GFX11-NEXT: s_lshr_b32 s71, s4, 8 -; GFX11-NEXT: s_lshr_b32 s82, s28, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 7 -; GFX11-NEXT: s_lshr_b32 s42, s7, 24 -; GFX11-NEXT: s_lshr_b32 s81, s28, 8 -; GFX11-NEXT: s_lshr_b32 s84, s26, 16 -; GFX11-NEXT: s_lshr_b32 s83, s26, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 8 -; GFX11-NEXT: s_lshr_b32 s42, s7, 16 -; GFX11-NEXT: s_lshr_b32 s86, s24, 16 -; GFX11-NEXT: s_lshr_b32 s85, s24, 8 -; GFX11-NEXT: s_lshr_b32 s96, s22, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 9 -; GFX11-NEXT: s_lshr_b32 s42, s7, 8 -; GFX11-NEXT: s_lshr_b32 s87, s22, 8 -; GFX11-NEXT: s_lshr_b32 s98, s20, 16 -; GFX11-NEXT: s_lshr_b32 s97, s20, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 10 -; GFX11-NEXT: s_lshr_b32 s42, s5, 24 -; GFX11-NEXT: s_lshr_b32 s100, s18, 16 -; GFX11-NEXT: s_lshr_b32 s99, s18, 8 -; GFX11-NEXT: s_lshr_b32 s102, s16, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 11 -; GFX11-NEXT: s_lshr_b32 s42, s5, 16 -; GFX11-NEXT: s_lshr_b32 s101, s16, 8 -; GFX11-NEXT: s_lshr_b32 s104, s2, 16 -; GFX11-NEXT: s_lshr_b32 s103, s2, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 12 -; GFX11-NEXT: s_lshr_b32 s42, s5, 8 -; GFX11-NEXT: s_lshr_b32 s35, s0, 16 -; GFX11-NEXT: s_lshr_b32 s34, s0, 8 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[40:41], 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 13 -; GFX11-NEXT: s_lshr_b32 s42, s29, 24 -; GFX11-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 -; GFX11-NEXT: s_lshr_b64 s[74:75], s[12:13], 24 -; GFX11-NEXT: s_lshr_b64 s[76:77], s[10:11], 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 14 -; GFX11-NEXT: s_lshr_b32 s42, s29, 16 ; GFX11-NEXT: s_lshr_b64 s[78:79], s[8:9], 24 -; GFX11-NEXT: s_lshr_b64 s[88:89], s[6:7], 24 -; GFX11-NEXT: s_lshr_b64 s[90:91], s[4:5], 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 15 -; GFX11-NEXT: s_lshr_b32 s42, s29, 8 -; GFX11-NEXT: s_lshr_b64 s[92:93], s[28:29], 24 -; GFX11-NEXT: s_lshr_b64 s[94:95], s[26:27], 24 -; GFX11-NEXT: s_lshr_b64 s[30:31], s[24:25], 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 16 -; GFX11-NEXT: s_lshr_b32 s42, s27, 24 -; GFX11-NEXT: s_lshr_b64 s[60:61], s[22:23], 24 -; GFX11-NEXT: s_lshr_b64 s[58:59], s[20:21], 24 -; GFX11-NEXT: s_lshr_b64 s[56:57], s[18:19], 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 17 -; GFX11-NEXT: s_lshr_b32 s42, s27, 16 -; GFX11-NEXT: s_lshr_b64 s[46:47], s[16:17], 24 -; GFX11-NEXT: s_lshr_b64 s[44:45], s[2:3], 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 18 -; GFX11-NEXT: s_lshr_b32 s42, s27, 8 +; GFX11-NEXT: s_lshr_b64 s[88:89], s[10:11], 24 +; GFX11-NEXT: s_lshr_b64 s[90:91], s[12:13], 24 +; GFX11-NEXT: v_writelane_b32 v42, s42, 15 +; GFX11-NEXT: s_lshr_b32 s42, s9, 8 +; GFX11-NEXT: s_lshr_b64 s[92:93], s[14:15], 24 +; GFX11-NEXT: s_lshr_b64 s[94:95], s[16:17], 24 +; GFX11-NEXT: s_lshr_b64 s[30:31], s[18:19], 24 +; GFX11-NEXT: v_writelane_b32 v42, s42, 16 +; GFX11-NEXT: s_lshr_b32 s42, s11, 24 +; GFX11-NEXT: s_lshr_b64 s[60:61], s[20:21], 24 +; GFX11-NEXT: s_lshr_b64 s[58:59], s[22:23], 24 +; GFX11-NEXT: s_lshr_b64 s[56:57], s[24:25], 24 +; GFX11-NEXT: v_writelane_b32 v42, s42, 17 +; GFX11-NEXT: s_lshr_b32 s42, s11, 16 +; GFX11-NEXT: s_lshr_b64 s[46:47], s[26:27], 24 +; GFX11-NEXT: s_lshr_b64 s[44:45], s[28:29], 24 +; GFX11-NEXT: v_writelane_b32 v42, s42, 18 +; GFX11-NEXT: s_lshr_b32 s42, s11, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 19 -; GFX11-NEXT: s_lshr_b32 s42, s25, 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 20 -; GFX11-NEXT: s_lshr_b32 s42, s25, 16 +; GFX11-NEXT: v_writelane_b32 v42, s42, 19 +; GFX11-NEXT: s_lshr_b32 s42, s13, 24 +; GFX11-NEXT: v_writelane_b32 v42, s42, 20 +; GFX11-NEXT: s_lshr_b32 s42, s13, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 21 -; GFX11-NEXT: s_lshr_b32 s42, s25, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 22 -; GFX11-NEXT: s_lshr_b32 s42, s23, 24 +; GFX11-NEXT: v_writelane_b32 v42, s42, 21 +; GFX11-NEXT: s_lshr_b32 s42, s13, 8 +; GFX11-NEXT: v_writelane_b32 v42, s42, 22 +; GFX11-NEXT: s_lshr_b32 s42, s15, 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 23 -; GFX11-NEXT: s_lshr_b32 s42, s23, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 24 -; GFX11-NEXT: s_lshr_b32 s42, s23, 8 +; GFX11-NEXT: v_writelane_b32 v42, s42, 23 +; GFX11-NEXT: s_lshr_b32 s42, s15, 16 +; GFX11-NEXT: v_writelane_b32 v42, s42, 24 +; GFX11-NEXT: s_lshr_b32 s42, s15, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 25 -; GFX11-NEXT: s_lshr_b32 s42, s21, 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 26 -; GFX11-NEXT: s_lshr_b32 s42, s21, 16 +; GFX11-NEXT: v_writelane_b32 v42, s42, 25 +; GFX11-NEXT: s_lshr_b32 s42, s17, 24 +; GFX11-NEXT: v_writelane_b32 v42, s42, 26 +; GFX11-NEXT: s_lshr_b32 s42, s17, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 27 -; GFX11-NEXT: s_lshr_b32 s42, s21, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 28 +; GFX11-NEXT: v_writelane_b32 v42, s42, 27 +; GFX11-NEXT: s_lshr_b32 s42, s17, 8 +; GFX11-NEXT: v_writelane_b32 v42, s42, 28 ; GFX11-NEXT: s_lshr_b32 s42, s19, 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 29 +; GFX11-NEXT: v_writelane_b32 v42, s42, 29 ; GFX11-NEXT: s_lshr_b32 s42, s19, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 30 +; GFX11-NEXT: v_writelane_b32 v42, s42, 30 ; GFX11-NEXT: s_lshr_b32 s42, s19, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 31 -; GFX11-NEXT: s_lshr_b32 s42, s17, 24 -; GFX11-NEXT: v_writelane_b32 v79, s42, 0 -; GFX11-NEXT: s_lshr_b32 s42, s17, 16 +; GFX11-NEXT: v_writelane_b32 v42, s42, 31 +; GFX11-NEXT: s_lshr_b32 s42, s21, 24 +; GFX11-NEXT: v_writelane_b32 v43, s42, 0 +; GFX11-NEXT: s_lshr_b32 s42, s21, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v79, s42, 1 -; GFX11-NEXT: s_lshr_b32 s42, s17, 8 -; GFX11-NEXT: v_writelane_b32 v79, s42, 2 -; GFX11-NEXT: s_lshr_b32 s42, s3, 24 +; GFX11-NEXT: v_writelane_b32 v43, s42, 1 +; GFX11-NEXT: s_lshr_b32 s42, s21, 8 +; GFX11-NEXT: v_writelane_b32 v43, s42, 2 +; GFX11-NEXT: s_lshr_b32 s42, s23, 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v79, s42, 3 -; GFX11-NEXT: s_lshr_b32 s42, s3, 16 -; GFX11-NEXT: v_writelane_b32 v79, s42, 4 -; GFX11-NEXT: s_lshr_b32 s42, s3, 8 +; GFX11-NEXT: v_writelane_b32 v43, s42, 3 +; GFX11-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-NEXT: v_writelane_b32 v43, s42, 4 +; GFX11-NEXT: s_lshr_b32 s42, s23, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v79, s42, 5 -; GFX11-NEXT: s_lshr_b32 s42, s1, 24 -; GFX11-NEXT: v_writelane_b32 v79, s42, 6 -; GFX11-NEXT: s_lshr_b32 s42, s1, 16 +; GFX11-NEXT: v_writelane_b32 v43, s42, 5 +; GFX11-NEXT: s_lshr_b32 s42, s25, 24 +; GFX11-NEXT: v_writelane_b32 v43, s42, 6 +; GFX11-NEXT: s_lshr_b32 s42, s25, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v79, s42, 7 -; GFX11-NEXT: s_lshr_b32 s42, s1, 8 -; GFX11-NEXT: v_writelane_b32 v79, s42, 8 -; GFX11-NEXT: s_lshr_b64 s[42:43], s[0:1], 24 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_hi +; GFX11-NEXT: v_writelane_b32 v43, s42, 7 +; GFX11-NEXT: s_lshr_b32 s42, s25, 8 +; GFX11-NEXT: v_writelane_b32 v43, s42, 8 +; GFX11-NEXT: s_lshr_b64 s[42:43], s[40:41], 24 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s49 ; GFX11-NEXT: s_cbranch_vccnz .LBB73_4 ; GFX11-NEXT: .LBB73_2: ; %cmp.true -; GFX11-NEXT: v_add_f64 v[23:24], s[24:25], 1.0 -; GFX11-NEXT: v_add_f64 v[28:29], s[22:23], 1.0 -; GFX11-NEXT: v_add_f64 v[32:33], s[20:21], 1.0 -; GFX11-NEXT: v_add_f64 v[5:6], s[12:13], 1.0 -; GFX11-NEXT: v_add_f64 v[36:37], s[18:19], 1.0 -; GFX11-NEXT: v_add_f64 v[52:53], s[2:3], 1.0 -; GFX11-NEXT: v_add_f64 v[1:2], s[40:41], 1.0 -; GFX11-NEXT: v_add_f64 v[3:4], s[14:15], 1.0 -; GFX11-NEXT: v_add_f64 v[7:8], s[10:11], 1.0 -; GFX11-NEXT: v_add_f64 v[9:10], s[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[50:51], s[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[52:53], s[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[68:69], s[40:41], 1.0 +; GFX11-NEXT: v_add_f64 v[64:65], s[28:29], 1.0 +; GFX11-NEXT: v_add_f64 v[48:49], s[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[35:36], s[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[31:32], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[29:30], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[25:26], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[21:22], s[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[17:18], s[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[13:14], s[8:9], 1.0 ; GFX11-NEXT: v_add_f64 v[11:12], s[6:7], 1.0 -; GFX11-NEXT: v_add_f64 v[13:14], s[4:5], 1.0 -; GFX11-NEXT: v_add_f64 v[15:16], s[28:29], 1.0 -; GFX11-NEXT: v_add_f64 v[19:20], s[26:27], 1.0 -; GFX11-NEXT: v_add_f64 v[48:49], s[16:17], 1.0 -; GFX11-NEXT: v_add_f64 v[64:65], s[0:1], 1.0 -; GFX11-NEXT: v_lshrrev_b64 v[66:67], 24, v[23:24] -; GFX11-NEXT: v_lshrrev_b64 v[67:68], 24, v[28:29] -; GFX11-NEXT: v_lshrrev_b64 v[68:69], 24, v[32:33] -; GFX11-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] -; GFX11-NEXT: v_lshrrev_b64 v[69:70], 24, v[36:37] -; GFX11-NEXT: v_lshrrev_b64 v[80:81], 24, v[52:53] -; GFX11-NEXT: v_lshrrev_b64 v[17:18], 24, v[1:2] -; GFX11-NEXT: v_lshrrev_b64 v[21:22], 24, v[3:4] -; GFX11-NEXT: v_lshrrev_b64 v[26:27], 24, v[7:8] -; GFX11-NEXT: v_lshrrev_b64 v[30:31], 24, v[9:10] -; GFX11-NEXT: v_lshrrev_b64 v[34:35], 24, v[11:12] -; GFX11-NEXT: v_lshrrev_b64 v[38:39], 24, v[13:14] -; GFX11-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] -; GFX11-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] +; GFX11-NEXT: v_add_f64 v[7:8], s[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[3:4], s[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[1:2], s[0:1], 1.0 +; GFX11-NEXT: v_lshrrev_b64 v[80:81], 24, v[50:51] +; GFX11-NEXT: v_lshrrev_b64 v[81:82], 24, v[52:53] +; GFX11-NEXT: v_readfirstlane_b32 s41, v69 +; GFX11-NEXT: v_readfirstlane_b32 s29, v65 +; GFX11-NEXT: v_readfirstlane_b32 s27, v53 +; GFX11-NEXT: v_readfirstlane_b32 s25, v51 +; GFX11-NEXT: v_readfirstlane_b32 s23, v49 +; GFX11-NEXT: v_readfirstlane_b32 s21, v36 +; GFX11-NEXT: v_readfirstlane_b32 s19, v32 +; GFX11-NEXT: v_readfirstlane_b32 s17, v30 +; GFX11-NEXT: v_readfirstlane_b32 s15, v26 +; GFX11-NEXT: v_readfirstlane_b32 s13, v22 +; GFX11-NEXT: v_readfirstlane_b32 s11, v18 +; GFX11-NEXT: v_readfirstlane_b32 s9, v14 +; GFX11-NEXT: v_readfirstlane_b32 s7, v12 +; GFX11-NEXT: v_readfirstlane_b32 s5, v8 +; GFX11-NEXT: v_readfirstlane_b32 s3, v4 +; GFX11-NEXT: v_readfirstlane_b32 s1, v2 +; GFX11-NEXT: v_lshrrev_b64 v[37:38], 24, v[25:26] +; GFX11-NEXT: v_lshrrev_b64 v[82:83], 24, v[64:65] +; GFX11-NEXT: v_lshrrev_b64 v[5:6], 24, v[1:2] +; GFX11-NEXT: v_lshrrev_b64 v[9:10], 24, v[3:4] +; GFX11-NEXT: v_lshrrev_b64 v[15:16], 24, v[7:8] +; GFX11-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] +; GFX11-NEXT: v_lshrrev_b64 v[23:24], 24, v[13:14] +; GFX11-NEXT: v_lshrrev_b64 v[27:28], 24, v[17:18] +; GFX11-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22] +; GFX11-NEXT: v_lshrrev_b64 v[38:39], 24, v[29:30] +; GFX11-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32] +; GFX11-NEXT: v_lshrrev_b64 v[66:67], 24, v[35:36] ; GFX11-NEXT: v_lshrrev_b64 v[70:71], 24, v[48:49] -; GFX11-NEXT: v_lshrrev_b64 v[81:82], 24, v[64:65] -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 24, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 8, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 8, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 24, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v83, 8, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 8, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v85, 24, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v87, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v96, 8, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v86, 8, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v99, 24, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v100, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v101, 8, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v97, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v98, 8, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v112, 24, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v114, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v116, 8, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v102, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v103, 8, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v119, 24, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v128, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v129, 8, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v113, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v115, 8, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v132, 24, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v133, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v134, 8, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v117, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v118, 8, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v145, 24, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v146, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v147, 8, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v130, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v131, 8, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v150, 24, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v151, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v162, 8, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v135, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v144, 8, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v164, 24, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v166, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v167, 8, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v148, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v149, 8, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v178, 24, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v179, 16, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v180, 8, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v160, 16, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v161, 8, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v183, 24, v33 -; GFX11-NEXT: v_lshrrev_b32_e32 v40, 16, v33 -; GFX11-NEXT: v_lshrrev_b32_e32 v42, 8, v33 -; GFX11-NEXT: v_lshrrev_b32_e32 v163, 16, v32 -; GFX11-NEXT: v_lshrrev_b32_e32 v165, 8, v32 -; GFX11-NEXT: v_lshrrev_b32_e32 v45, 24, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v47, 16, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v56, 8, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v176, 16, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v177, 8, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v57, 24, v49 -; GFX11-NEXT: v_lshrrev_b32_e32 v58, 16, v49 -; GFX11-NEXT: v_lshrrev_b32_e32 v59, 8, v49 -; GFX11-NEXT: v_lshrrev_b32_e32 v181, 16, v48 -; GFX11-NEXT: v_lshrrev_b32_e32 v182, 8, v48 -; GFX11-NEXT: v_lshrrev_b32_e32 v60, 24, v53 -; GFX11-NEXT: v_lshrrev_b32_e32 v61, 16, v53 -; GFX11-NEXT: v_lshrrev_b32_e32 v62, 8, v53 -; GFX11-NEXT: v_lshrrev_b32_e32 v41, 16, v52 -; GFX11-NEXT: v_lshrrev_b32_e32 v43, 8, v52 -; GFX11-NEXT: v_lshrrev_b32_e32 v63, 24, v65 -; GFX11-NEXT: v_lshrrev_b32_e32 v72, 16, v65 -; GFX11-NEXT: v_lshrrev_b32_e32 v73, 8, v65 -; GFX11-NEXT: v_lshrrev_b32_e32 v44, 16, v64 -; GFX11-NEXT: v_lshrrev_b32_e32 v46, 8, v64 +; GFX11-NEXT: v_lshrrev_b64 v[83:84], 24, v[68:69] +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 8, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v12, 8, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 8, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v20, 8, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v24, 8, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v28, 8, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 8, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v36, 8, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v49, 8, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v53, 8, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v48 +; GFX11-NEXT: v_lshrrev_b32_e32 v67, 8, v48 +; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v50 +; GFX11-NEXT: v_lshrrev_b32_e32 v85, 8, v50 +; GFX11-NEXT: v_lshrrev_b32_e32 v86, 16, v52 +; GFX11-NEXT: v_lshrrev_b32_e32 v87, 8, v52 +; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v64 +; GFX11-NEXT: v_lshrrev_b32_e32 v97, 8, v64 +; GFX11-NEXT: v_lshrrev_b32_e32 v98, 16, v68 +; GFX11-NEXT: v_lshrrev_b32_e32 v99, 8, v68 +; GFX11-NEXT: s_lshr_b32 s8, s1, 24 +; GFX11-NEXT: s_lshr_b32 s10, s1, 16 +; GFX11-NEXT: s_lshr_b32 s12, s1, 8 +; GFX11-NEXT: s_lshr_b32 s14, s3, 24 +; GFX11-NEXT: s_lshr_b32 s16, s3, 16 +; GFX11-NEXT: s_lshr_b32 s18, s3, 8 +; GFX11-NEXT: s_lshr_b32 s20, s5, 24 +; GFX11-NEXT: s_lshr_b32 s22, s5, 16 +; GFX11-NEXT: s_lshr_b32 s24, s5, 8 +; GFX11-NEXT: s_lshr_b32 s26, s7, 24 +; GFX11-NEXT: s_lshr_b32 s28, s7, 16 +; GFX11-NEXT: s_lshr_b32 s40, s7, 8 +; GFX11-NEXT: s_lshr_b32 s42, s9, 24 +; GFX11-NEXT: s_lshr_b32 s43, s9, 16 +; GFX11-NEXT: s_lshr_b32 s44, s9, 8 +; GFX11-NEXT: s_lshr_b32 s45, s11, 24 +; GFX11-NEXT: s_lshr_b32 s46, s11, 16 +; GFX11-NEXT: s_lshr_b32 s47, s11, 8 +; GFX11-NEXT: s_lshr_b32 s56, s13, 24 +; GFX11-NEXT: s_lshr_b32 s57, s13, 16 +; GFX11-NEXT: s_lshr_b32 s58, s13, 8 +; GFX11-NEXT: s_lshr_b32 s59, s15, 24 +; GFX11-NEXT: s_lshr_b32 s60, s15, 16 +; GFX11-NEXT: s_lshr_b32 s61, s15, 8 +; GFX11-NEXT: s_lshr_b32 s62, s17, 24 +; GFX11-NEXT: s_lshr_b32 s63, s17, 16 +; GFX11-NEXT: s_lshr_b32 s72, s17, 8 +; GFX11-NEXT: s_lshr_b32 s73, s19, 24 +; GFX11-NEXT: s_lshr_b32 s74, s19, 16 +; GFX11-NEXT: s_lshr_b32 s75, s19, 8 +; GFX11-NEXT: s_lshr_b32 s76, s21, 24 +; GFX11-NEXT: s_lshr_b32 s77, s21, 16 +; GFX11-NEXT: s_lshr_b32 s78, s21, 8 +; GFX11-NEXT: s_lshr_b32 s79, s23, 24 +; GFX11-NEXT: s_lshr_b32 s88, s23, 16 +; GFX11-NEXT: s_lshr_b32 s89, s23, 8 +; GFX11-NEXT: s_lshr_b32 s90, s25, 24 +; GFX11-NEXT: s_lshr_b32 s91, s25, 16 +; GFX11-NEXT: s_lshr_b32 s92, s25, 8 +; GFX11-NEXT: s_lshr_b32 s104, s27, 24 +; GFX11-NEXT: s_lshr_b32 vcc_hi, s27, 16 +; GFX11-NEXT: s_lshr_b32 s34, s27, 8 +; GFX11-NEXT: s_lshr_b32 s35, s29, 24 +; GFX11-NEXT: s_lshr_b32 s36, s29, 16 +; GFX11-NEXT: s_lshr_b32 s37, s29, 8 +; GFX11-NEXT: s_lshr_b32 s38, s41, 24 +; GFX11-NEXT: s_lshr_b32 s39, s41, 16 +; GFX11-NEXT: s_lshr_b32 s48, s41, 8 ; GFX11-NEXT: s_branch .LBB73_5 ; GFX11-NEXT: .LBB73_3: ; GFX11-NEXT: ; implicit-def: $sgpr43 ; GFX11-NEXT: ; kill: killed $sgpr43 +; GFX11-NEXT: s_mov_b32 s49, -1 ; GFX11-NEXT: ; implicit-def: $sgpr43 ; GFX11-NEXT: ; kill: killed $sgpr43 -; GFX11-NEXT: ; implicit-def: $sgpr34 -; GFX11-NEXT: ; implicit-def: $sgpr35 -; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr51 ; GFX11-NEXT: ; implicit-def: $sgpr103 -; GFX11-NEXT: ; implicit-def: $sgpr104 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr48 +; GFX11-NEXT: ; implicit-def: $sgpr39 +; GFX11-NEXT: ; implicit-def: $sgpr38 +; GFX11-NEXT: ; implicit-def: $sgpr102 +; GFX11-NEXT: ; implicit-def: $sgpr50 ; GFX11-NEXT: ; implicit-def: $sgpr44 +; GFX11-NEXT: ; implicit-def: $sgpr37 +; GFX11-NEXT: ; implicit-def: $sgpr36 +; GFX11-NEXT: ; implicit-def: $sgpr35 +; GFX11-NEXT: ; implicit-def: $sgpr100 ; GFX11-NEXT: ; implicit-def: $sgpr101 -; GFX11-NEXT: ; implicit-def: $sgpr102 ; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr34 +; GFX11-NEXT: ; implicit-def: $vcc_hi +; GFX11-NEXT: ; implicit-def: $sgpr104 +; GFX11-NEXT: ; implicit-def: $sgpr98 ; GFX11-NEXT: ; implicit-def: $sgpr99 -; GFX11-NEXT: ; implicit-def: $sgpr100 ; GFX11-NEXT: ; implicit-def: $sgpr56 +; GFX11-NEXT: ; implicit-def: $sgpr96 ; GFX11-NEXT: ; implicit-def: $sgpr97 -; GFX11-NEXT: ; implicit-def: $sgpr98 ; GFX11-NEXT: ; implicit-def: $sgpr58 +; GFX11-NEXT: ; implicit-def: $sgpr86 ; GFX11-NEXT: ; implicit-def: $sgpr87 -; GFX11-NEXT: ; implicit-def: $sgpr96 ; GFX11-NEXT: ; implicit-def: $sgpr60 -; GFX11-NEXT: ; implicit-def: $sgpr85 -; GFX11-NEXT: ; implicit-def: $sgpr86 -; GFX11-NEXT: ; implicit-def: $sgpr83 ; GFX11-NEXT: ; implicit-def: $sgpr84 -; GFX11-NEXT: ; implicit-def: $sgpr81 +; GFX11-NEXT: ; implicit-def: $sgpr85 ; GFX11-NEXT: ; implicit-def: $sgpr82 -; GFX11-NEXT: ; implicit-def: $sgpr71 +; GFX11-NEXT: ; implicit-def: $sgpr83 ; GFX11-NEXT: ; implicit-def: $sgpr80 -; GFX11-NEXT: ; implicit-def: $sgpr69 +; GFX11-NEXT: ; implicit-def: $sgpr81 ; GFX11-NEXT: ; implicit-def: $sgpr70 -; GFX11-NEXT: ; implicit-def: $sgpr67 +; GFX11-NEXT: ; implicit-def: $sgpr71 ; GFX11-NEXT: ; implicit-def: $sgpr68 -; GFX11-NEXT: ; implicit-def: $sgpr65 +; GFX11-NEXT: ; implicit-def: $sgpr69 ; GFX11-NEXT: ; implicit-def: $sgpr66 -; GFX11-NEXT: ; implicit-def: $sgpr55 +; GFX11-NEXT: ; implicit-def: $sgpr67 ; GFX11-NEXT: ; implicit-def: $sgpr64 -; GFX11-NEXT: ; implicit-def: $sgpr36 -; GFX11-NEXT: ; implicit-def: $sgpr53 +; GFX11-NEXT: ; implicit-def: $sgpr65 ; GFX11-NEXT: ; implicit-def: $sgpr54 -; GFX11-NEXT: ; implicit-def: $sgpr37 -; GFX11-NEXT: ; implicit-def: $sgpr38 -; GFX11-NEXT: ; implicit-def: $sgpr39 -; GFX11-NEXT: ; implicit-def: $sgpr51 +; GFX11-NEXT: ; implicit-def: $sgpr55 ; GFX11-NEXT: ; implicit-def: $sgpr52 -; GFX11-NEXT: ; implicit-def: $sgpr48 -; GFX11-NEXT: ; implicit-def: $sgpr49 -; GFX11-NEXT: ; implicit-def: $sgpr50 +; GFX11-NEXT: ; implicit-def: $sgpr53 ; GFX11-NEXT: ; implicit-def: $sgpr30 ; GFX11-NEXT: ; implicit-def: $sgpr94 ; GFX11-NEXT: ; implicit-def: $sgpr92 @@ -121620,524 +121236,441 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX11-NEXT: ; kill: killed $sgpr43 ; GFX11-NEXT: s_branch .LBB73_2 ; GFX11-NEXT: .LBB73_4: -; GFX11-NEXT: v_dual_mov_b32 v64, s0 :: v_dual_mov_b32 v65, s1 -; GFX11-NEXT: v_readlane_b32 s0, v78, 0 -; GFX11-NEXT: v_dual_mov_b32 v1, s40 :: v_dual_mov_b32 v2, s41 -; GFX11-NEXT: v_dual_mov_b32 v3, s14 :: v_dual_mov_b32 v4, s15 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_mov_b32_e32 v87, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 1 -; GFX11-NEXT: v_mov_b32_e32 v39, s54 -; GFX11-NEXT: v_dual_mov_b32 v5, s12 :: v_dual_mov_b32 v6, s13 -; GFX11-NEXT: v_dual_mov_b32 v7, s10 :: v_dual_mov_b32 v8, s11 -; GFX11-NEXT: v_mov_b32_e32 v96, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 2 -; GFX11-NEXT: v_dual_mov_b32 v9, s8 :: v_dual_mov_b32 v10, s9 -; GFX11-NEXT: v_dual_mov_b32 v11, s6 :: v_dual_mov_b32 v12, s7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_mov_b32_e32 v99, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 3 -; GFX11-NEXT: v_mov_b32_e32 v55, s53 -; GFX11-NEXT: v_dual_mov_b32 v13, s4 :: v_dual_mov_b32 v14, s5 -; GFX11-NEXT: v_dual_mov_b32 v15, s28 :: v_dual_mov_b32 v16, s29 -; GFX11-NEXT: v_mov_b32_e32 v100, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 4 -; GFX11-NEXT: v_dual_mov_b32 v19, s26 :: v_dual_mov_b32 v20, s27 -; GFX11-NEXT: v_dual_mov_b32 v23, s24 :: v_dual_mov_b32 v24, s25 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v101, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 5 -; GFX11-NEXT: v_dual_mov_b32 v27, s51 :: v_dual_mov_b32 v28, s22 -; GFX11-NEXT: v_dual_mov_b32 v29, s23 :: v_dual_mov_b32 v32, s20 -; GFX11-NEXT: v_dual_mov_b32 v33, s21 :: v_dual_mov_b32 v112, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 6 -; GFX11-NEXT: v_dual_mov_b32 v31, s49 :: v_dual_mov_b32 v36, s18 -; GFX11-NEXT: v_dual_mov_b32 v37, s19 :: v_dual_mov_b32 v48, s16 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v49, s17 :: v_dual_mov_b32 v114, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 7 -; GFX11-NEXT: v_dual_mov_b32 v35, s48 :: v_dual_mov_b32 v52, s2 -; GFX11-NEXT: v_dual_mov_b32 v53, s3 :: v_dual_mov_b32 v44, s35 -; GFX11-NEXT: v_dual_mov_b32 v41, s104 :: v_dual_mov_b32 v116, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 8 -; GFX11-NEXT: v_dual_mov_b32 v46, s34 :: v_dual_mov_b32 v43, s103 -; GFX11-NEXT: v_dual_mov_b32 v181, s102 :: v_dual_mov_b32 v182, s101 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_mov_b32_e32 v119, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 9 -; GFX11-NEXT: v_dual_mov_b32 v51, s39 :: v_dual_mov_b32 v176, s100 -; GFX11-NEXT: v_mov_b32_e32 v177, s99 -; GFX11-NEXT: v_dual_mov_b32 v163, s98 :: v_dual_mov_b32 v160, s96 -; GFX11-NEXT: v_mov_b32_e32 v128, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 10 -; GFX11-NEXT: v_dual_mov_b32 v165, s97 :: v_dual_mov_b32 v148, s86 -; GFX11-NEXT: v_dual_mov_b32 v161, s87 :: v_dual_mov_b32 v144, s83 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_mov_b32_e32 v129, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 11 -; GFX11-NEXT: v_mov_b32_e32 v71, s38 -; GFX11-NEXT: v_dual_mov_b32 v149, s85 :: v_dual_mov_b32 v130, s82 -; GFX11-NEXT: v_dual_mov_b32 v135, s84 :: v_dual_mov_b32 v118, s71 -; GFX11-NEXT: v_mov_b32_e32 v132, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 12 -; GFX11-NEXT: v_dual_mov_b32 v131, s81 :: v_dual_mov_b32 v102, s68 -; GFX11-NEXT: v_dual_mov_b32 v117, s80 :: v_dual_mov_b32 v98, s65 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_mov_b32_e32 v133, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 13 -; GFX11-NEXT: v_mov_b32_e32 v83, s37 -; GFX11-NEXT: v_dual_mov_b32 v113, s70 :: v_dual_mov_b32 v84, s64 -; GFX11-NEXT: v_dual_mov_b32 v115, s69 :: v_dual_mov_b32 v86, s55 -; GFX11-NEXT: v_mov_b32_e32 v134, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 14 -; GFX11-NEXT: v_dual_mov_b32 v103, s67 :: v_dual_mov_b32 v18, s52 -; GFX11-NEXT: v_dual_mov_b32 v97, s66 :: v_dual_mov_b32 v22, s50 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_mov_b32_e32 v145, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 15 -; GFX11-NEXT: v_mov_b32_e32 v85, s36 -; GFX11-NEXT: v_dual_mov_b32 v81, s42 :: v_dual_mov_b32 v38, s90 -; GFX11-NEXT: v_dual_mov_b32 v69, s56 :: v_dual_mov_b32 v34, s88 -; GFX11-NEXT: v_mov_b32_e32 v146, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 16 -; GFX11-NEXT: v_dual_mov_b32 v67, s60 :: v_dual_mov_b32 v30, s78 -; GFX11-NEXT: v_dual_mov_b32 v26, s76 :: v_dual_mov_b32 v25, s74 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_mov_b32_e32 v147, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 17 -; GFX11-NEXT: v_mov_b32_e32 v21, s72 -; GFX11-NEXT: v_dual_mov_b32 v17, s62 :: v_dual_mov_b32 v80, s44 -; GFX11-NEXT: v_mov_b32_e32 v70, s46 -; GFX11-NEXT: v_mov_b32_e32 v150, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 18 -; GFX11-NEXT: v_mov_b32_e32 v68, s58 -; GFX11-NEXT: v_mov_b32_e32 v66, s30 -; GFX11-NEXT: v_mov_b32_e32 v54, s94 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v50, s92 :: v_dual_mov_b32 v151, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 19 -; GFX11-NEXT: v_mov_b32_e32 v162, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 20 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v164, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 21 -; GFX11-NEXT: v_mov_b32_e32 v166, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 22 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v167, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 23 -; GFX11-NEXT: v_mov_b32_e32 v178, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 24 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v179, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 25 -; GFX11-NEXT: v_mov_b32_e32 v180, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 26 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v183, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 27 -; GFX11-NEXT: v_mov_b32_e32 v40, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 28 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v42, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 29 -; GFX11-NEXT: v_mov_b32_e32 v45, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 30 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v47, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 31 -; GFX11-NEXT: v_mov_b32_e32 v56, s0 -; GFX11-NEXT: v_readlane_b32 s0, v79, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v57, s0 -; GFX11-NEXT: v_readlane_b32 s0, v79, 1 -; GFX11-NEXT: v_mov_b32_e32 v58, s0 -; GFX11-NEXT: v_readlane_b32 s0, v79, 2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v59, s0 -; GFX11-NEXT: v_readlane_b32 s0, v79, 3 -; GFX11-NEXT: v_mov_b32_e32 v60, s0 -; GFX11-NEXT: v_readlane_b32 s0, v79, 4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v61, s0 -; GFX11-NEXT: v_readlane_b32 s0, v79, 5 -; GFX11-NEXT: v_mov_b32_e32 v62, s0 -; GFX11-NEXT: v_readlane_b32 s0, v79, 6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v63, s0 -; GFX11-NEXT: v_readlane_b32 s0, v79, 7 -; GFX11-NEXT: v_mov_b32_e32 v72, s0 -; GFX11-NEXT: v_readlane_b32 s0, v79, 8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v73, s0 +; GFX11-NEXT: v_readlane_b32 s43, v42, 0 +; GFX11-NEXT: v_dual_mov_b32 v98, s103 :: v_dual_mov_b32 v99, s51 +; GFX11-NEXT: v_dual_mov_b32 v96, s50 :: v_dual_mov_b32 v97, s102 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v2, s43 :: v_dual_mov_b32 v7, s4 +; GFX11-NEXT: v_readlane_b32 s43, v42, 1 +; GFX11-NEXT: v_dual_mov_b32 v86, s101 :: v_dual_mov_b32 v87, s100 +; GFX11-NEXT: v_dual_mov_b32 v71, s99 :: v_dual_mov_b32 v34, s83 +; GFX11-NEXT: v_dual_mov_b32 v85, s98 :: v_dual_mov_b32 v36, s82 +; GFX11-NEXT: v_dual_mov_b32 v55, s97 :: v_dual_mov_b32 v30, s81 +; GFX11-NEXT: v_dual_mov_b32 v67, s96 :: v_dual_mov_b32 v32, s80 +; GFX11-NEXT: v_dual_mov_b32 v51, s87 :: v_dual_mov_b32 v26, s71 +; GFX11-NEXT: v_dual_mov_b32 v53, s86 :: v_dual_mov_b32 v28, s70 +; GFX11-NEXT: v_dual_mov_b32 v39, s85 :: v_dual_mov_b32 v22, s69 +; GFX11-NEXT: v_dual_mov_b32 v49, s84 :: v_dual_mov_b32 v24, s68 +; GFX11-NEXT: v_dual_mov_b32 v18, s67 :: v_dual_mov_b32 v35, s20 +; GFX11-NEXT: v_dual_mov_b32 v20, s66 :: v_dual_mov_b32 v31, s18 +; GFX11-NEXT: v_dual_mov_b32 v14, s65 :: v_dual_mov_b32 v29, s16 +; GFX11-NEXT: v_dual_mov_b32 v16, s64 :: v_dual_mov_b32 v25, s14 +; GFX11-NEXT: v_dual_mov_b32 v10, s55 :: v_dual_mov_b32 v21, s12 +; GFX11-NEXT: v_dual_mov_b32 v12, s54 :: v_dual_mov_b32 v17, s10 +; GFX11-NEXT: v_dual_mov_b32 v6, s53 :: v_dual_mov_b32 v13, s8 +; GFX11-NEXT: v_dual_mov_b32 v8, s52 :: v_dual_mov_b32 v11, s6 +; GFX11-NEXT: v_dual_mov_b32 v4, s43 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: v_dual_mov_b32 v68, s40 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_dual_mov_b32 v64, s28 :: v_dual_mov_b32 v83, s42 +; GFX11-NEXT: v_dual_mov_b32 v52, s26 :: v_dual_mov_b32 v81, s46 +; GFX11-NEXT: v_dual_mov_b32 v50, s24 :: v_dual_mov_b32 v37, s92 +; GFX11-NEXT: v_dual_mov_b32 v48, s22 :: v_dual_mov_b32 v33, s90 +; GFX11-NEXT: v_dual_mov_b32 v82, s44 :: v_dual_mov_b32 v27, s88 +; GFX11-NEXT: v_dual_mov_b32 v80, s56 :: v_dual_mov_b32 v23, s78 +; GFX11-NEXT: v_dual_mov_b32 v70, s58 :: v_dual_mov_b32 v19, s76 +; GFX11-NEXT: v_dual_mov_b32 v66, s60 :: v_dual_mov_b32 v15, s74 +; GFX11-NEXT: v_dual_mov_b32 v54, s30 :: v_dual_mov_b32 v9, s72 +; GFX11-NEXT: v_dual_mov_b32 v38, s94 :: v_dual_mov_b32 v5, s62 +; GFX11-NEXT: v_readlane_b32 s8, v42, 2 +; GFX11-NEXT: v_readlane_b32 s10, v42, 3 +; GFX11-NEXT: v_readlane_b32 s12, v42, 4 +; GFX11-NEXT: v_readlane_b32 s14, v42, 5 +; GFX11-NEXT: v_readlane_b32 s16, v42, 6 +; GFX11-NEXT: v_readlane_b32 s18, v42, 7 +; GFX11-NEXT: v_readlane_b32 s20, v42, 8 +; GFX11-NEXT: v_readlane_b32 s22, v42, 9 +; GFX11-NEXT: v_readlane_b32 s24, v42, 10 +; GFX11-NEXT: v_readlane_b32 s26, v42, 11 +; GFX11-NEXT: v_readlane_b32 s28, v42, 12 +; GFX11-NEXT: v_readlane_b32 s40, v42, 13 +; GFX11-NEXT: v_readlane_b32 s42, v42, 14 +; GFX11-NEXT: v_readlane_b32 s43, v42, 15 +; GFX11-NEXT: v_readlane_b32 s44, v42, 16 +; GFX11-NEXT: v_readlane_b32 s45, v42, 17 +; GFX11-NEXT: v_readlane_b32 s46, v42, 18 +; GFX11-NEXT: v_readlane_b32 s47, v42, 19 +; GFX11-NEXT: v_readlane_b32 s56, v42, 20 +; GFX11-NEXT: v_readlane_b32 s57, v42, 21 +; GFX11-NEXT: v_readlane_b32 s58, v42, 22 +; GFX11-NEXT: v_readlane_b32 s59, v42, 23 +; GFX11-NEXT: v_readlane_b32 s60, v42, 24 +; GFX11-NEXT: v_readlane_b32 s61, v42, 25 +; GFX11-NEXT: v_readlane_b32 s62, v42, 26 +; GFX11-NEXT: v_readlane_b32 s63, v42, 27 +; GFX11-NEXT: v_readlane_b32 s72, v42, 28 +; GFX11-NEXT: v_readlane_b32 s73, v42, 29 +; GFX11-NEXT: v_readlane_b32 s74, v42, 30 +; GFX11-NEXT: v_readlane_b32 s75, v42, 31 +; GFX11-NEXT: v_readlane_b32 s76, v43, 0 +; GFX11-NEXT: v_readlane_b32 s77, v43, 1 +; GFX11-NEXT: v_readlane_b32 s78, v43, 2 +; GFX11-NEXT: v_readlane_b32 s79, v43, 3 +; GFX11-NEXT: v_readlane_b32 s88, v43, 4 +; GFX11-NEXT: v_readlane_b32 s89, v43, 5 +; GFX11-NEXT: v_readlane_b32 s90, v43, 6 +; GFX11-NEXT: v_readlane_b32 s91, v43, 7 +; GFX11-NEXT: v_readlane_b32 s92, v43, 8 ; GFX11-NEXT: .LBB73_5: ; %end -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v82, 8, v46 -; GFX11-NEXT: v_and_b32_e32 v64, 0xff, v64 -; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v81 -; GFX11-NEXT: v_and_b32_e32 v65, 0xff, v65 -; GFX11-NEXT: v_lshlrev_b32_e32 v80, 8, v80 -; GFX11-NEXT: v_and_b32_e32 v52, 0xff, v52 -; GFX11-NEXT: v_or_b32_e32 v64, v64, v82 -; GFX11-NEXT: v_and_b32_e32 v82, 0xff, v44 -; GFX11-NEXT: v_lshlrev_b32_e32 v44, 8, v63 -; GFX11-NEXT: v_lshlrev_b32_e32 v43, 8, v43 -; GFX11-NEXT: v_and_b32_e32 v41, 0xff, v41 +; GFX11-NEXT: s_and_b32 s0, s41, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s48, 8 +; GFX11-NEXT: s_lshl_b32 s4, s38, 8 +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_and_b32 s2, s39, 0xff +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_or_b32 s2, s2, s4 +; GFX11-NEXT: v_and_b32_e32 v65, 0xff, v68 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v99 +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_and_b32 s2, s29, 0xff +; GFX11-NEXT: s_lshl_b32 s4, s37, 8 +; GFX11-NEXT: s_lshl_b32 s6, s35, 8 +; GFX11-NEXT: s_or_b32 s2, s2, s4 +; GFX11-NEXT: s_and_b32 s4, s36, 0xff +; GFX11-NEXT: v_lshlrev_b32_e32 v68, 8, v83 +; GFX11-NEXT: s_or_b32 s4, s4, s6 +; GFX11-NEXT: v_or_b32_e32 v65, v65, v69 +; GFX11-NEXT: v_and_b32_e32 v69, 0xff, v98 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-NEXT: s_lshl_b32 s6, s90, 8 +; GFX11-NEXT: s_or_b32 s2, s2, s4 +; GFX11-NEXT: v_or_b32_e32 v68, v69, v68 +; GFX11-NEXT: v_dual_mov_b32 v99, s2 :: v_dual_and_b32 v64, 0xff, v64 +; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v97 +; GFX11-NEXT: v_mov_b32_e32 v97, s0 +; GFX11-NEXT: s_and_b32 s0, s27, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s34, 8 +; GFX11-NEXT: s_lshl_b32 s4, s104, 8 +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_and_b32 s2, vcc_hi, 0xff +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_or_b32 s2, s2, s4 +; GFX11-NEXT: s_lshl_b32 s4, s92, 8 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: v_and_b32_e32 v65, 0xffff, v65 +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_and_b32 s2, s25, 0xff +; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v68 +; GFX11-NEXT: s_or_b32 s2, s2, s4 +; GFX11-NEXT: s_and_b32 s4, s91, 0xff +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_or_b32 s4, s4, s6 +; GFX11-NEXT: v_and_b32_e32 v83, 0xff, v96 +; GFX11-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-NEXT: v_or_b32_e32 v64, v64, v69 +; GFX11-NEXT: s_or_b32 s2, s2, s4 +; GFX11-NEXT: v_lshlrev_b32_e32 v82, 8, v82 +; GFX11-NEXT: v_or_b32_e32 v96, v65, v68 +; GFX11-NEXT: v_dual_mov_b32 v81, s0 :: v_dual_lshlrev_b32 v68, 8, v81 +; GFX11-NEXT: s_and_b32 s0, s23, 0xff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v69, v83, v82 +; GFX11-NEXT: v_mov_b32_e32 v83, s2 +; GFX11-NEXT: s_lshl_b32 s2, s89, 8 +; GFX11-NEXT: s_lshl_b32 s4, s79, 8 +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_and_b32 s2, s88, 0xff +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_or_b32 s2, s2, s4 +; GFX11-NEXT: s_and_b32 s4, s21, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: s_lshl_b32 s6, s78, 8 +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_or_b32 s2, s4, s6 +; GFX11-NEXT: s_and_b32 s4, s77, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s76, 8 ; GFX11-NEXT: v_and_b32_e32 v64, 0xffff, v64 -; GFX11-NEXT: v_or_b32_e32 v81, v82, v81 -; GFX11-NEXT: v_lshlrev_b32_e32 v82, 8, v73 -; GFX11-NEXT: v_or_b32_e32 v52, v52, v43 -; GFX11-NEXT: v_or_b32_e32 v80, v41, v80 -; GFX11-NEXT: v_and_b32_e32 v53, 0xff, v53 -; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v81 -; GFX11-NEXT: v_or_b32_e32 v65, v65, v82 -; GFX11-NEXT: v_and_b32_e32 v82, 0xff, v72 -; GFX11-NEXT: v_lshlrev_b32_e32 v41, 8, v60 -; GFX11-NEXT: v_and_b32_e32 v52, 0xffff, v52 -; GFX11-NEXT: v_or_b32_e32 v72, v64, v81 -; GFX11-NEXT: v_and_b32_e32 v64, 0xffff, v65 -; GFX11-NEXT: v_or_b32_e32 v82, v82, v44 -; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v62 +; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v69 +; GFX11-NEXT: s_or_b32 s4, s4, s6 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-NEXT: v_and_b32_e32 v65, 0xff, v86 +; GFX11-NEXT: v_or_b32_e32 v98, v64, v69 +; GFX11-NEXT: v_lshlrev_b32_e32 v64, 8, v87 +; GFX11-NEXT: s_or_b32 s2, s2, s4 +; GFX11-NEXT: v_and_b32_e32 v52, 0xff, v52 +; GFX11-NEXT: v_and_b32_e32 v50, 0xff, v50 +; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v80 ; GFX11-NEXT: v_and_b32_e32 v48, 0xff, v48 -; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v70 -; GFX11-NEXT: v_and_b32_e32 v49, 0xff, v49 -; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v82 -; GFX11-NEXT: v_and_b32_e32 v82, 0xff, v61 -; GFX11-NEXT: v_or_b32_e32 v53, v53, v81 -; GFX11-NEXT: v_and_b32_e32 v81, 0xff, v181 -; GFX11-NEXT: v_and_b32_e32 v36, 0xff, v36 -; GFX11-NEXT: v_or_b32_e32 v73, v64, v65 -; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v80 -; GFX11-NEXT: v_or_b32_e32 v65, v82, v41 -; GFX11-NEXT: v_lshlrev_b32_e32 v80, 8, v182 -; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v69 -; GFX11-NEXT: v_and_b32_e32 v37, 0xff, v37 -; GFX11-NEXT: v_or_b32_e32 v74, v52, v64 -; GFX11-NEXT: v_and_b32_e32 v52, 0xffff, v53 -; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v65 -; GFX11-NEXT: v_or_b32_e32 v48, v48, v80 -; GFX11-NEXT: v_or_b32_e32 v64, v81, v70 -; GFX11-NEXT: v_lshlrev_b32_e32 v65, 8, v59 -; GFX11-NEXT: v_and_b32_e32 v70, 0xff, v58 -; GFX11-NEXT: v_lshlrev_b32_e32 v80, 8, v57 -; GFX11-NEXT: v_or_b32_e32 v75, v52, v53 -; GFX11-NEXT: v_and_b32_e32 v48, 0xffff, v48 -; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v64 -; GFX11-NEXT: v_or_b32_e32 v49, v49, v65 -; GFX11-NEXT: v_or_b32_e32 v53, v70, v80 -; GFX11-NEXT: v_lshlrev_b32_e32 v64, 8, v177 -; GFX11-NEXT: v_and_b32_e32 v65, 0xff, v176 -; GFX11-NEXT: v_or_b32_e32 v43, v48, v52 -; GFX11-NEXT: v_and_b32_e32 v48, 0xffff, v49 -; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v53 -; GFX11-NEXT: v_or_b32_e32 v36, v36, v64 -; GFX11-NEXT: v_or_b32_e32 v52, v65, v69 -; GFX11-NEXT: v_lshlrev_b32_e32 v53, 8, v56 -; GFX11-NEXT: v_and_b32_e32 v64, 0xff, v47 -; GFX11-NEXT: v_lshlrev_b32_e32 v65, 8, v45 -; GFX11-NEXT: v_or_b32_e32 v44, v48, v49 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v52 -; GFX11-NEXT: v_or_b32_e32 v37, v37, v53 -; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GFX11-NEXT: v_or_b32_e32 v49, v64, v65 -; GFX11-NEXT: v_lshlrev_b32_e32 v52, 8, v165 -; GFX11-NEXT: v_and_b32_e32 v53, 0xff, v163 -; GFX11-NEXT: v_lshlrev_b32_e32 v64, 8, v68 -; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; GFX11-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GFX11-NEXT: v_lshlrev_b32_e32 v65, 8, v42 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff, v37 -; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GFX11-NEXT: v_or_b32_e32 v32, v32, v52 -; GFX11-NEXT: v_or_b32_e32 v52, v53, v64 -; GFX11-NEXT: v_or_b32_e32 v33, v33, v65 -; GFX11-NEXT: v_or_b32_e32 v45, v36, v48 -; GFX11-NEXT: v_or_b32_e32 v46, v37, v49 -; GFX11-NEXT: v_and_b32_e32 v37, 0xff, v40 -; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v52 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 8, v183 -; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GFX11-NEXT: v_lshlrev_b32_e32 v49, 8, v161 -; GFX11-NEXT: v_and_b32_e32 v52, 0xff, v160 -; GFX11-NEXT: v_lshlrev_b32_e32 v53, 8, v67 +; GFX11-NEXT: v_and_b32_e32 v35, 0xff, v35 +; GFX11-NEXT: v_or_b32_e32 v52, v52, v64 +; GFX11-NEXT: v_or_b32_e32 v64, v65, v68 +; GFX11-NEXT: v_lshlrev_b32_e32 v65, 8, v85 +; GFX11-NEXT: v_and_b32_e32 v68, 0xff, v71 +; GFX11-NEXT: v_and_b32_e32 v51, 0xff, v51 +; GFX11-NEXT: v_and_b32_e32 v52, 0xffff, v52 +; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v64 +; GFX11-NEXT: v_or_b32_e32 v50, v50, v65 +; GFX11-NEXT: v_or_b32_e32 v65, v68, v69 +; GFX11-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-NEXT: v_and_b32_e32 v39, 0xff, v39 +; GFX11-NEXT: v_or_b32_e32 v80, v52, v64 +; GFX11-NEXT: v_and_b32_e32 v50, 0xffff, v50 +; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v65 +; GFX11-NEXT: v_and_b32_e32 v52, 0xff, v55 +; GFX11-NEXT: v_lshlrev_b32_e32 v55, 8, v70 ; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v29 -; GFX11-NEXT: v_lshlrev_b32_e32 v64, 8, v180 -; GFX11-NEXT: v_and_b32_e32 v65, 0xff, v179 -; GFX11-NEXT: v_lshlrev_b32_e32 v67, 8, v178 -; GFX11-NEXT: v_or_b32_e32 v37, v37, v48 -; GFX11-NEXT: v_or_b32_e32 v28, v28, v49 -; GFX11-NEXT: v_or_b32_e32 v48, v52, v53 -; GFX11-NEXT: v_or_b32_e32 v29, v29, v64 -; GFX11-NEXT: v_or_b32_e32 v49, v65, v67 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GFX11-NEXT: v_or_b32_e32 v67, v32, v36 -; GFX11-NEXT: v_or_b32_e32 v68, v33, v37 -; GFX11-NEXT: v_or_b32_e32 v69, v28, v48 -; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GFX11-NEXT: v_or_b32_e32 v70, v29, v49 -; GFX11-NEXT: v_lshlrev_b32_e32 v28, 8, v149 -; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v148 -; GFX11-NEXT: v_lshlrev_b32_e32 v32, 8, v66 -; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GFX11-NEXT: v_lshlrev_b32_e32 v33, 8, v167 -; GFX11-NEXT: v_and_b32_e32 v36, 0xff, v166 -; GFX11-NEXT: v_lshlrev_b32_e32 v37, 8, v164 -; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 8, v144 -; GFX11-NEXT: v_or_b32_e32 v23, v23, v28 -; GFX11-NEXT: v_or_b32_e32 v28, v29, v32 -; GFX11-NEXT: v_or_b32_e32 v24, v24, v33 -; GFX11-NEXT: v_or_b32_e32 v29, v36, v37 -; GFX11-NEXT: v_or_b32_e32 v19, v19, v48 -; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v135 -; GFX11-NEXT: v_lshlrev_b32_e32 v33, 8, v54 -; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GFX11-NEXT: v_lshlrev_b32_e32 v36, 8, v162 -; GFX11-NEXT: v_and_b32_e32 v37, 0xff, v151 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 8, v150 -; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GFX11-NEXT: v_lshlrev_b32_e32 v49, 8, v131 -; GFX11-NEXT: v_and_b32_e32 v52, 0xff, v130 -; GFX11-NEXT: v_lshlrev_b32_e32 v50, 8, v50 -; GFX11-NEXT: v_or_b32_e32 v32, v32, v33 -; GFX11-NEXT: v_or_b32_e32 v20, v20, v36 -; GFX11-NEXT: v_or_b32_e32 v33, v37, v48 -; GFX11-NEXT: v_or_b32_e32 v15, v15, v49 -; GFX11-NEXT: v_or_b32_e32 v36, v52, v50 -; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX11-NEXT: v_or_b32_e32 v148, v23, v28 -; GFX11-NEXT: v_or_b32_e32 v150, v19, v32 -; GFX11-NEXT: v_or_b32_e32 v151, v20, v33 -; GFX11-NEXT: v_or_b32_e32 v130, v15, v36 -; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v16 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v147 -; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v146 -; GFX11-NEXT: v_lshlrev_b32_e32 v20, 8, v145 +; GFX11-NEXT: v_lshlrev_b32_e32 v36, 8, v36 +; GFX11-NEXT: v_or_b32_e32 v82, v50, v65 +; GFX11-NEXT: v_lshlrev_b32_e32 v50, 8, v67 +; GFX11-NEXT: v_and_b32_e32 v34, 0xff, v34 +; GFX11-NEXT: v_lshlrev_b32_e32 v38, 8, v38 +; GFX11-NEXT: v_or_b32_e32 v36, v29, v36 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-NEXT: v_or_b32_e32 v48, v48, v50 +; GFX11-NEXT: v_or_b32_e32 v50, v52, v55 +; GFX11-NEXT: v_lshlrev_b32_e32 v52, 8, v66 +; GFX11-NEXT: v_lshlrev_b32_e32 v32, 8, v32 +; GFX11-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GFX11-NEXT: v_and_b32_e32 v48, 0xffff, v48 +; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; GFX11-NEXT: s_lshl_b32 s4, s73, 8 +; GFX11-NEXT: v_or_b32_e32 v34, v34, v38 +; GFX11-NEXT: v_or_b32_e32 v25, v25, v32 +; GFX11-NEXT: s_lshl_b32 s6, s62, 8 +; GFX11-NEXT: v_or_b32_e32 v48, v48, v50 +; GFX11-NEXT: v_lshlrev_b32_e32 v50, 8, v53 +; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GFX11-NEXT: v_or_b32_e32 v35, v35, v50 +; GFX11-NEXT: v_or_b32_e32 v50, v51, v52 +; GFX11-NEXT: v_lshlrev_b32_e32 v51, 8, v54 +; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v50 ; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GFX11-NEXT: v_lshlrev_b32_e32 v23, 8, v118 -; GFX11-NEXT: v_or_b32_e32 v149, v24, v29 -; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v117 -; GFX11-NEXT: v_lshlrev_b32_e32 v28, 8, v38 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v18 ; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GFX11-NEXT: v_lshlrev_b32_e32 v29, 8, v134 -; GFX11-NEXT: v_or_b32_e32 v15, v15, v16 -; GFX11-NEXT: v_or_b32_e32 v16, v19, v20 -; GFX11-NEXT: v_or_b32_e32 v13, v13, v23 -; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v133 -; GFX11-NEXT: v_lshlrev_b32_e32 v23, 8, v132 -; GFX11-NEXT: v_or_b32_e32 v19, v24, v28 -; GFX11-NEXT: v_or_b32_e32 v14, v14, v29 -; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v24, 8, v115 -; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v113 -; GFX11-NEXT: v_lshlrev_b32_e32 v29, 8, v34 -; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GFX11-NEXT: v_lshlrev_b32_e32 v32, 8, v129 -; GFX11-NEXT: v_and_b32_e32 v33, 0xff, v128 -; GFX11-NEXT: v_lshlrev_b32_e32 v34, 8, v119 -; GFX11-NEXT: v_or_b32_e32 v20, v20, v23 -; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: v_or_b32_e32 v50, v35, v50 +; GFX11-NEXT: v_lshlrev_b32_e32 v35, 8, v49 +; GFX11-NEXT: v_mov_b32_e32 v49, s0 +; GFX11-NEXT: s_and_b32 s0, s19, 0xff +; GFX11-NEXT: v_or_b32_e32 v13, v13, v20 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX11-NEXT: v_or_b32_e32 v31, v31, v35 +; GFX11-NEXT: v_or_b32_e32 v35, v39, v51 +; GFX11-NEXT: v_mov_b32_e32 v51, s2 +; GFX11-NEXT: s_lshl_b32 s2, s75, 8 ; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX11-NEXT: v_or_b32_e32 v11, v11, v24 -; GFX11-NEXT: v_or_b32_e32 v23, v28, v29 -; GFX11-NEXT: v_or_b32_e32 v12, v12, v32 -; GFX11-NEXT: v_or_b32_e32 v24, v33, v34 -; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-NEXT: v_or_b32_e32 v131, v15, v16 -; GFX11-NEXT: v_or_b32_e32 v132, v13, v19 -; GFX11-NEXT: v_or_b32_e32 v133, v14, v20 -; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GFX11-NEXT: v_lshlrev_b32_e32 v13, 8, v103 -; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v102 -; GFX11-NEXT: v_lshlrev_b32_e32 v15, 8, v30 -; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v116 -; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v114 -; GFX11-NEXT: v_lshlrev_b32_e32 v20, 8, v112 -; GFX11-NEXT: v_or_b32_e32 v11, v11, v23 -; GFX11-NEXT: v_or_b32_e32 v12, v12, v24 +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_and_b32 s2, s74, 0xff +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_or_b32 s2, s2, s4 +; GFX11-NEXT: v_or_b32_e32 v29, v31, v35 +; GFX11-NEXT: v_lshlrev_b32_e32 v35, 8, v37 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: s_lshl_b32 s4, s72, 8 +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_and_b32 s2, s17, 0xff +; GFX11-NEXT: v_or_b32_e32 v30, v30, v35 +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v36 +; GFX11-NEXT: s_or_b32 s2, s2, s4 +; GFX11-NEXT: s_and_b32 s4, s63, 0xff +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v30 +; GFX11-NEXT: s_or_b32 s4, s4, s6 +; GFX11-NEXT: v_or_b32_e32 v31, v31, v34 +; GFX11-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-NEXT: v_mov_b32_e32 v30, s0 +; GFX11-NEXT: v_or_b32_e32 v34, v25, v35 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 8, v28 +; GFX11-NEXT: v_lshlrev_b32_e32 v28, 8, v33 +; GFX11-NEXT: s_or_b32 s2, s2, s4 +; GFX11-NEXT: s_and_b32 s0, s15, 0xff +; GFX11-NEXT: v_mov_b32_e32 v32, s2 +; GFX11-NEXT: s_lshl_b32 s2, s61, 8 +; GFX11-NEXT: v_or_b32_e32 v21, v21, v25 +; GFX11-NEXT: v_or_b32_e32 v25, v26, v28 +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_and_b32 s2, s60, 0xff +; GFX11-NEXT: s_lshl_b32 s4, s59, 8 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_or_b32 s2, s2, s4 +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-NEXT: s_and_b32 s4, s13, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s58, 8 +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_or_b32 s2, s4, s6 +; GFX11-NEXT: s_and_b32 s4, s57, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s56, 8 +; GFX11-NEXT: v_or_b32_e32 v36, v21, v25 +; GFX11-NEXT: s_or_b32 s4, s4, s6 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v24 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 8, v27 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-NEXT: v_mov_b32_e32 v35, s0 +; GFX11-NEXT: s_or_b32 s2, s2, s4 +; GFX11-NEXT: v_or_b32_e32 v17, v17, v21 +; GFX11-NEXT: v_or_b32_e32 v21, v22, v24 +; GFX11-NEXT: v_mov_b32_e32 v37, s2 +; GFX11-NEXT: s_and_b32 s0, s11, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s47, 8 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 8, v23 +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_and_b32 s2, s46, 0xff +; GFX11-NEXT: s_lshl_b32 s4, s45, 8 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_or_b32 s2, s2, s4 +; GFX11-NEXT: v_or_b32_e32 v18, v18, v22 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: s_lshl_b32 s4, s44, 8 +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_and_b32 s2, s9, 0xff +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: s_or_b32 s2, s2, s4 +; GFX11-NEXT: s_and_b32 s4, s43, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s42, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_or_b32 s4, s4, s6 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v19 +; GFX11-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-NEXT: v_or_b32_e32 v19, v13, v18 ; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-NEXT: v_lshlrev_b32_e32 v23, 8, v98 -; GFX11-NEXT: v_or_b32_e32 v9, v9, v13 -; GFX11-NEXT: v_or_b32_e32 v13, v14, v15 -; GFX11-NEXT: v_or_b32_e32 v10, v10, v16 -; GFX11-NEXT: v_or_b32_e32 v14, v19, v20 -; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v97 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v26 -; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-NEXT: v_lshlrev_b32_e32 v19, 8, v101 -; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v24, 8, v86 -; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v84 -; GFX11-NEXT: v_lshlrev_b32_e32 v25, 8, v25 -; GFX11-NEXT: v_or_b32_e32 v7, v7, v23 -; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v100 -; GFX11-NEXT: v_lshlrev_b32_e32 v23, 8, v99 -; GFX11-NEXT: v_or_b32_e32 v15, v15, v16 -; GFX11-NEXT: v_or_b32_e32 v8, v8, v19 -; GFX11-NEXT: v_or_b32_e32 v5, v5, v24 -; GFX11-NEXT: v_or_b32_e32 v19, v26, v25 -; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 8, v15 +; GFX11-NEXT: s_or_b32 s2, s2, s4 +; GFX11-NEXT: v_mov_b32_e32 v18, s0 +; GFX11-NEXT: v_mov_b32_e32 v20, s2 +; GFX11-NEXT: s_and_b32 s0, s7, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s40, 8 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_and_b32 s2, s28, 0xff +; GFX11-NEXT: s_lshl_b32 s4, s26, 8 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v12 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v13 +; GFX11-NEXT: s_or_b32 s2, s2, s4 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v16 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: s_and_b32 s4, s5, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s24, 8 ; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-NEXT: v_or_b32_e32 v16, v20, v23 -; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-NEXT: v_or_b32_e32 v13, v9, v13 -; GFX11-NEXT: v_or_b32_e32 v14, v10, v14 -; GFX11-NEXT: v_or_b32_e32 v7, v7, v15 -; GFX11-NEXT: v_or_b32_e32 v9, v5, v19 -; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 8, v96 -; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v87 -; GFX11-NEXT: v_lshlrev_b32_e32 v15, 8, v85 -; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v39 -; GFX11-NEXT: v_lshlrev_b32_e32 v20, 8, v21 -; GFX11-NEXT: v_or_b32_e32 v8, v8, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_or_b32 s2, s4, s5 +; GFX11-NEXT: s_and_b32 s4, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s20, 8 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_or_b32_e32 v13, v7, v10 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v55 -; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v83 -; GFX11-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX11-NEXT: v_or_b32_e32 v6, v10, v15 -; GFX11-NEXT: v_or_b32_e32 v10, v19, v20 -; GFX11-NEXT: v_or_b32_e32 v3, v3, v16 -; GFX11-NEXT: v_or_b32_e32 v4, v4, v21 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v51 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 8, v8 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 8, v9 +; GFX11-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v14 +; GFX11-NEXT: s_or_b32 s2, s2, s4 +; GFX11-NEXT: v_mov_b32_e32 v12, s0 +; GFX11-NEXT: v_mov_b32_e32 v14, s2 +; GFX11-NEXT: s_and_b32 s0, s3, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s18, 8 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v7 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v8 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v10 -; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v71 -; GFX11-NEXT: v_lshlrev_b32_e32 v19, 8, v27 -; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GFX11-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v20, 8, v35 -; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v31 -; GFX11-NEXT: v_lshlrev_b32_e32 v22, 8, v22 -; GFX11-NEXT: v_or_b32_e32 v10, v10, v16 -; GFX11-NEXT: v_or_b32_e32 v1, v1, v19 -; GFX11-NEXT: v_or_b32_e32 v16, v18, v17 -; GFX11-NEXT: v_or_b32_e32 v2, v2, v20 -; GFX11-NEXT: v_or_b32_e32 v17, v21, v22 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_and_b32 s2, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s14, 8 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v10 -; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-NEXT: v_or_b32_e32 v10, v5, v6 +; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: v_or_b32_e32 v4, v1, v4 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_lshl_b32 s2, s12, 8 +; GFX11-NEXT: s_and_b32 s3, s10, 0xff +; GFX11-NEXT: s_lshl_b32 s4, s8, 8 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: s_or_b32 s1, s1, s2 +; GFX11-NEXT: s_or_b32 s2, s3, s4 +; GFX11-NEXT: v_or_b32_e32 v1, v3, v6 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: v_or_b32_e32 v17, v17, v21 +; GFX11-NEXT: s_or_b32 s1, s1, s2 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b128 v0, v[72:75], off -; GFX11-NEXT: scratch_store_b128 v0, v[43:46], off offset:16 -; GFX11-NEXT: v_or_b32_e32 v1, v3, v15 -; GFX11-NEXT: v_or_b32_e32 v2, v4, v18 -; GFX11-NEXT: v_or_b32_e32 v3, v19, v16 -; GFX11-NEXT: v_or_b32_e32 v4, v20, v17 +; GFX11-NEXT: scratch_store_b128 v0, v[96:99], off +; GFX11-NEXT: scratch_store_b128 v0, v[80:83], off offset:16 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: v_mov_b32_e32 v4, s1 ; GFX11-NEXT: s_clause 0x5 -; GFX11-NEXT: scratch_store_b128 v0, v[67:70], off offset:32 -; GFX11-NEXT: scratch_store_b128 v0, v[148:151], off offset:48 -; GFX11-NEXT: scratch_store_b128 v0, v[130:133], off offset:64 -; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:80 -; GFX11-NEXT: scratch_store_b128 v0, v[7:10], off offset:96 +; GFX11-NEXT: scratch_store_b128 v0, v[48:51], off offset:32 +; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off offset:48 +; GFX11-NEXT: scratch_store_b128 v0, v[34:37], off offset:64 +; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:80 +; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:96 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 -; GFX11-NEXT: s_clause 0x13 ; 80-byte Folded Reload -; GFX11-NEXT: scratch_load_b32 v75, off, s32 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:76 -; GFX11-NEXT: v_readlane_b32 s104, v77, 8 -; GFX11-NEXT: v_readlane_b32 s103, v77, 7 -; GFX11-NEXT: v_readlane_b32 s102, v77, 6 -; GFX11-NEXT: v_readlane_b32 s101, v77, 5 -; GFX11-NEXT: v_readlane_b32 s100, v77, 4 -; GFX11-NEXT: v_readlane_b32 s99, v77, 3 -; GFX11-NEXT: v_readlane_b32 s98, v77, 2 -; GFX11-NEXT: v_readlane_b32 s97, v77, 1 -; GFX11-NEXT: v_readlane_b32 s96, v77, 0 -; GFX11-NEXT: v_readlane_b32 s87, v76, 31 -; GFX11-NEXT: v_readlane_b32 s86, v76, 30 -; GFX11-NEXT: v_readlane_b32 s85, v76, 29 -; GFX11-NEXT: v_readlane_b32 s84, v76, 28 -; GFX11-NEXT: v_readlane_b32 s83, v76, 27 -; GFX11-NEXT: v_readlane_b32 s82, v76, 26 -; GFX11-NEXT: v_readlane_b32 s81, v76, 25 -; GFX11-NEXT: v_readlane_b32 s80, v76, 24 -; GFX11-NEXT: v_readlane_b32 s71, v76, 23 -; GFX11-NEXT: v_readlane_b32 s70, v76, 22 -; GFX11-NEXT: v_readlane_b32 s69, v76, 21 -; GFX11-NEXT: v_readlane_b32 s68, v76, 20 -; GFX11-NEXT: v_readlane_b32 s67, v76, 19 -; GFX11-NEXT: v_readlane_b32 s66, v76, 18 -; GFX11-NEXT: v_readlane_b32 s65, v76, 17 -; GFX11-NEXT: v_readlane_b32 s64, v76, 16 -; GFX11-NEXT: v_readlane_b32 s55, v76, 15 -; GFX11-NEXT: v_readlane_b32 s54, v76, 14 -; GFX11-NEXT: v_readlane_b32 s53, v76, 13 -; GFX11-NEXT: v_readlane_b32 s52, v76, 12 -; GFX11-NEXT: v_readlane_b32 s51, v76, 11 -; GFX11-NEXT: v_readlane_b32 s50, v76, 10 -; GFX11-NEXT: v_readlane_b32 s49, v76, 9 -; GFX11-NEXT: v_readlane_b32 s48, v76, 8 -; GFX11-NEXT: v_readlane_b32 s39, v76, 7 -; GFX11-NEXT: v_readlane_b32 s38, v76, 6 -; GFX11-NEXT: v_readlane_b32 s37, v76, 5 -; GFX11-NEXT: v_readlane_b32 s36, v76, 4 -; GFX11-NEXT: v_readlane_b32 s35, v76, 3 -; GFX11-NEXT: v_readlane_b32 s34, v76, 2 -; GFX11-NEXT: v_readlane_b32 s31, v76, 1 -; GFX11-NEXT: v_readlane_b32 s30, v76, 0 +; GFX11-NEXT: v_readlane_b32 s104, v41, 8 +; GFX11-NEXT: v_readlane_b32 s103, v41, 7 +; GFX11-NEXT: v_readlane_b32 s102, v41, 6 +; GFX11-NEXT: v_readlane_b32 s101, v41, 5 +; GFX11-NEXT: v_readlane_b32 s100, v41, 4 +; GFX11-NEXT: v_readlane_b32 s99, v41, 3 +; GFX11-NEXT: v_readlane_b32 s98, v41, 2 +; GFX11-NEXT: v_readlane_b32 s97, v41, 1 +; GFX11-NEXT: v_readlane_b32 s96, v41, 0 +; GFX11-NEXT: v_readlane_b32 s87, v40, 31 +; GFX11-NEXT: v_readlane_b32 s86, v40, 30 +; GFX11-NEXT: v_readlane_b32 s85, v40, 29 +; GFX11-NEXT: v_readlane_b32 s84, v40, 28 +; GFX11-NEXT: v_readlane_b32 s83, v40, 27 +; GFX11-NEXT: v_readlane_b32 s82, v40, 26 +; GFX11-NEXT: v_readlane_b32 s81, v40, 25 +; GFX11-NEXT: v_readlane_b32 s80, v40, 24 +; GFX11-NEXT: v_readlane_b32 s71, v40, 23 +; GFX11-NEXT: v_readlane_b32 s70, v40, 22 +; GFX11-NEXT: v_readlane_b32 s69, v40, 21 +; GFX11-NEXT: v_readlane_b32 s68, v40, 20 +; GFX11-NEXT: v_readlane_b32 s67, v40, 19 +; GFX11-NEXT: v_readlane_b32 s66, v40, 18 +; GFX11-NEXT: v_readlane_b32 s65, v40, 17 +; GFX11-NEXT: v_readlane_b32 s64, v40, 16 +; GFX11-NEXT: v_readlane_b32 s55, v40, 15 +; GFX11-NEXT: v_readlane_b32 s54, v40, 14 +; GFX11-NEXT: v_readlane_b32 s53, v40, 13 +; GFX11-NEXT: v_readlane_b32 s52, v40, 12 +; GFX11-NEXT: v_readlane_b32 s51, v40, 11 +; GFX11-NEXT: v_readlane_b32 s50, v40, 10 +; GFX11-NEXT: v_readlane_b32 s49, v40, 9 +; GFX11-NEXT: v_readlane_b32 s48, v40, 8 +; GFX11-NEXT: v_readlane_b32 s39, v40, 7 +; GFX11-NEXT: v_readlane_b32 s38, v40, 6 +; GFX11-NEXT: v_readlane_b32 s37, v40, 5 +; GFX11-NEXT: v_readlane_b32 s36, v40, 4 +; GFX11-NEXT: v_readlane_b32 s35, v40, 3 +; GFX11-NEXT: v_readlane_b32 s34, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload -; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:12 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -133942,40 +133475,68 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: v_writelane_b32 v63, s67, 19 ; SI-NEXT: v_writelane_b32 v63, s68, 20 ; SI-NEXT: v_writelane_b32 v63, s69, 21 +; SI-NEXT: v_mov_b32_e32 v20, s16 ; SI-NEXT: v_writelane_b32 v63, s70, 22 +; SI-NEXT: v_readfirstlane_b32 s4, v20 +; SI-NEXT: v_mov_b32_e32 v20, s17 ; SI-NEXT: v_writelane_b32 v63, s71, 23 +; SI-NEXT: v_readfirstlane_b32 s5, v20 +; SI-NEXT: v_mov_b32_e32 v20, s18 ; SI-NEXT: v_writelane_b32 v63, s80, 24 +; SI-NEXT: v_readfirstlane_b32 s6, v20 +; SI-NEXT: v_mov_b32_e32 v20, s19 ; SI-NEXT: v_writelane_b32 v63, s81, 25 +; SI-NEXT: v_readfirstlane_b32 s7, v20 +; SI-NEXT: v_mov_b32_e32 v20, s20 ; SI-NEXT: v_writelane_b32 v63, s82, 26 +; SI-NEXT: v_readfirstlane_b32 s8, v20 +; SI-NEXT: v_mov_b32_e32 v20, s21 ; SI-NEXT: v_writelane_b32 v63, s83, 27 +; SI-NEXT: v_readfirstlane_b32 s9, v20 +; SI-NEXT: v_mov_b32_e32 v20, s22 ; SI-NEXT: v_writelane_b32 v63, s84, 28 +; SI-NEXT: v_readfirstlane_b32 s20, v20 +; SI-NEXT: v_mov_b32_e32 v20, s23 ; SI-NEXT: v_writelane_b32 v63, s85, 29 +; SI-NEXT: v_readfirstlane_b32 s21, v20 +; SI-NEXT: v_mov_b32_e32 v20, s24 ; SI-NEXT: v_writelane_b32 v63, s86, 30 +; SI-NEXT: v_readfirstlane_b32 s24, v20 +; SI-NEXT: v_mov_b32_e32 v20, s25 ; SI-NEXT: v_writelane_b32 v63, s87, 31 +; SI-NEXT: v_readfirstlane_b32 s25, v20 +; SI-NEXT: v_mov_b32_e32 v20, s26 ; SI-NEXT: v_writelane_b32 v63, s96, 32 +; SI-NEXT: v_readfirstlane_b32 s40, v20 +; SI-NEXT: v_mov_b32_e32 v20, s27 ; SI-NEXT: v_writelane_b32 v63, s97, 33 +; SI-NEXT: v_readfirstlane_b32 s41, v20 +; SI-NEXT: v_mov_b32_e32 v20, s28 ; SI-NEXT: v_writelane_b32 v63, s98, 34 +; SI-NEXT: v_readfirstlane_b32 s42, v20 +; SI-NEXT: v_mov_b32_e32 v20, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 ; SI-NEXT: v_writelane_b32 v63, s99, 35 +; SI-NEXT: v_readfirstlane_b32 s43, v20 ; SI-NEXT: v_readfirstlane_b32 s44, v1 ; SI-NEXT: v_readfirstlane_b32 s45, v2 -; SI-NEXT: v_readfirstlane_b32 s42, v3 -; SI-NEXT: v_readfirstlane_b32 s43, v4 -; SI-NEXT: v_readfirstlane_b32 s40, v5 -; SI-NEXT: v_readfirstlane_b32 s41, v6 -; SI-NEXT: v_readfirstlane_b32 s14, v7 -; SI-NEXT: v_readfirstlane_b32 s15, v8 -; SI-NEXT: v_readfirstlane_b32 s12, v9 -; SI-NEXT: v_readfirstlane_b32 s13, v10 -; SI-NEXT: v_readfirstlane_b32 s10, v11 -; SI-NEXT: v_readfirstlane_b32 s11, v12 -; SI-NEXT: v_readfirstlane_b32 s8, v13 -; SI-NEXT: v_readfirstlane_b32 s9, v14 -; SI-NEXT: v_readfirstlane_b32 s4, v15 -; SI-NEXT: v_readfirstlane_b32 s5, v16 -; SI-NEXT: v_readfirstlane_b32 s6, v17 +; SI-NEXT: v_readfirstlane_b32 s28, v3 +; SI-NEXT: v_readfirstlane_b32 s29, v4 +; SI-NEXT: v_readfirstlane_b32 s26, v5 +; SI-NEXT: v_readfirstlane_b32 s27, v6 +; SI-NEXT: v_readfirstlane_b32 s22, v7 +; SI-NEXT: v_readfirstlane_b32 s23, v8 +; SI-NEXT: v_readfirstlane_b32 s18, v9 +; SI-NEXT: v_readfirstlane_b32 s19, v10 +; SI-NEXT: v_readfirstlane_b32 s16, v11 +; SI-NEXT: v_readfirstlane_b32 s17, v12 +; SI-NEXT: v_readfirstlane_b32 s14, v13 +; SI-NEXT: v_readfirstlane_b32 s15, v14 +; SI-NEXT: v_readfirstlane_b32 s10, v15 +; SI-NEXT: v_readfirstlane_b32 s11, v16 +; SI-NEXT: v_readfirstlane_b32 s12, v17 ; SI-NEXT: s_and_b64 s[46:47], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s7, v18 +; SI-NEXT: v_readfirstlane_b32 s13, v18 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill @@ -133993,78 +133554,78 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane ; SI-NEXT: s_cbranch_scc0 .LBB77_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_and_b32 s46, s17, 0xffff0000 +; SI-NEXT: s_and_b32 s46, s5, 0xffff0000 ; SI-NEXT: v_writelane_b32 v62, s46, 0 -; SI-NEXT: s_lshl_b32 s46, s17, 16 +; SI-NEXT: s_lshl_b32 s46, s5, 16 ; SI-NEXT: v_writelane_b32 v62, s46, 1 -; SI-NEXT: s_and_b32 s46, s16, 0xffff0000 +; SI-NEXT: s_and_b32 s46, s4, 0xffff0000 ; SI-NEXT: v_writelane_b32 v62, s46, 2 -; SI-NEXT: s_lshl_b32 s46, s16, 16 -; SI-NEXT: s_and_b32 s59, s7, 0xffff0000 -; SI-NEXT: s_lshl_b32 s58, s7, 16 -; SI-NEXT: s_and_b32 s57, s6, 0xffff0000 -; SI-NEXT: s_lshl_b32 s56, s6, 16 -; SI-NEXT: s_and_b32 s99, s5, 0xffff0000 -; SI-NEXT: s_lshl_b32 s98, s5, 16 -; SI-NEXT: s_and_b32 s97, s4, 0xffff0000 -; SI-NEXT: s_lshl_b32 s96, s4, 16 -; SI-NEXT: s_and_b32 s87, s9, 0xffff0000 -; SI-NEXT: s_lshl_b32 s86, s9, 16 -; SI-NEXT: s_and_b32 s85, s8, 0xffff0000 -; SI-NEXT: s_lshl_b32 s84, s8, 16 -; SI-NEXT: s_and_b32 s83, s11, 0xffff0000 -; SI-NEXT: s_lshl_b32 s82, s11, 16 -; SI-NEXT: s_and_b32 s81, s10, 0xffff0000 -; SI-NEXT: s_lshl_b32 s80, s10, 16 -; SI-NEXT: s_and_b32 s71, s13, 0xffff0000 -; SI-NEXT: s_lshl_b32 s70, s13, 16 -; SI-NEXT: s_and_b32 s69, s12, 0xffff0000 -; SI-NEXT: s_lshl_b32 s68, s12, 16 -; SI-NEXT: s_and_b32 s67, s15, 0xffff0000 -; SI-NEXT: s_lshl_b32 s66, s15, 16 -; SI-NEXT: s_and_b32 s65, s14, 0xffff0000 -; SI-NEXT: s_lshl_b32 s64, s14, 16 -; SI-NEXT: s_and_b32 s55, s41, 0xffff0000 -; SI-NEXT: s_lshl_b32 s54, s41, 16 -; SI-NEXT: s_and_b32 s53, s40, 0xffff0000 -; SI-NEXT: s_lshl_b32 s52, s40, 16 -; SI-NEXT: s_and_b32 s51, s43, 0xffff0000 -; SI-NEXT: s_lshl_b32 s50, s43, 16 -; SI-NEXT: s_and_b32 s49, s42, 0xffff0000 -; SI-NEXT: s_lshl_b32 s48, s42, 16 +; SI-NEXT: s_lshl_b32 s46, s4, 16 +; SI-NEXT: s_and_b32 s59, s13, 0xffff0000 +; SI-NEXT: s_lshl_b32 s58, s13, 16 +; SI-NEXT: s_and_b32 s57, s12, 0xffff0000 +; SI-NEXT: s_lshl_b32 s56, s12, 16 +; SI-NEXT: s_and_b32 s99, s11, 0xffff0000 +; SI-NEXT: s_lshl_b32 s98, s11, 16 +; SI-NEXT: s_and_b32 s97, s10, 0xffff0000 +; SI-NEXT: s_lshl_b32 s96, s10, 16 +; SI-NEXT: s_and_b32 s87, s15, 0xffff0000 +; SI-NEXT: s_lshl_b32 s86, s15, 16 +; SI-NEXT: s_and_b32 s85, s14, 0xffff0000 +; SI-NEXT: s_lshl_b32 s84, s14, 16 +; SI-NEXT: s_and_b32 s83, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s82, s17, 16 +; SI-NEXT: s_and_b32 s81, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s80, s16, 16 +; SI-NEXT: s_and_b32 s71, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s70, s19, 16 +; SI-NEXT: s_and_b32 s69, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s68, s18, 16 +; SI-NEXT: s_and_b32 s67, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s66, s23, 16 +; SI-NEXT: s_and_b32 s65, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s64, s22, 16 +; SI-NEXT: s_and_b32 s55, s27, 0xffff0000 +; SI-NEXT: s_lshl_b32 s54, s27, 16 +; SI-NEXT: s_and_b32 s53, s26, 0xffff0000 +; SI-NEXT: s_lshl_b32 s52, s26, 16 +; SI-NEXT: s_and_b32 s51, s29, 0xffff0000 +; SI-NEXT: s_lshl_b32 s50, s29, 16 +; SI-NEXT: s_and_b32 s49, s28, 0xffff0000 +; SI-NEXT: s_lshl_b32 s48, s28, 16 ; SI-NEXT: s_and_b32 s39, s45, 0xffff0000 ; SI-NEXT: s_lshl_b32 s38, s45, 16 ; SI-NEXT: s_and_b32 s37, s44, 0xffff0000 ; SI-NEXT: s_lshl_b32 s36, s44, 16 -; SI-NEXT: s_and_b32 s35, s29, 0xffff0000 -; SI-NEXT: s_lshl_b32 s34, s29, 16 -; SI-NEXT: s_and_b32 s31, s28, 0xffff0000 -; SI-NEXT: s_lshl_b32 s30, s28, 16 -; SI-NEXT: s_and_b32 s95, s27, 0xffff0000 -; SI-NEXT: s_lshl_b32 s94, s27, 16 -; SI-NEXT: s_and_b32 s93, s26, 0xffff0000 -; SI-NEXT: s_lshl_b32 s92, s26, 16 +; SI-NEXT: s_and_b32 s35, s43, 0xffff0000 +; SI-NEXT: s_lshl_b32 s34, s43, 16 +; SI-NEXT: s_and_b32 s31, s42, 0xffff0000 +; SI-NEXT: s_lshl_b32 s30, s42, 16 +; SI-NEXT: s_and_b32 s95, s41, 0xffff0000 +; SI-NEXT: s_lshl_b32 s94, s41, 16 +; SI-NEXT: s_and_b32 s93, s40, 0xffff0000 +; SI-NEXT: s_lshl_b32 s92, s40, 16 ; SI-NEXT: s_and_b32 s91, s25, 0xffff0000 ; SI-NEXT: s_lshl_b32 s90, s25, 16 ; SI-NEXT: s_and_b32 s89, s24, 0xffff0000 ; SI-NEXT: s_lshl_b32 s88, s24, 16 -; SI-NEXT: s_and_b32 s79, s23, 0xffff0000 -; SI-NEXT: s_lshl_b32 s78, s23, 16 -; SI-NEXT: s_and_b32 s77, s22, 0xffff0000 -; SI-NEXT: s_lshl_b32 s76, s22, 16 -; SI-NEXT: s_and_b32 s75, s21, 0xffff0000 -; SI-NEXT: s_lshl_b32 s74, s21, 16 -; SI-NEXT: s_and_b32 s73, s20, 0xffff0000 -; SI-NEXT: s_lshl_b32 s72, s20, 16 -; SI-NEXT: s_and_b32 s63, s19, 0xffff0000 -; SI-NEXT: s_lshl_b32 s62, s19, 16 -; SI-NEXT: s_and_b32 s61, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s60, s18, 16 +; SI-NEXT: s_and_b32 s79, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s78, s21, 16 +; SI-NEXT: s_and_b32 s77, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s76, s20, 16 +; SI-NEXT: s_and_b32 s75, s9, 0xffff0000 +; SI-NEXT: s_lshl_b32 s74, s9, 16 +; SI-NEXT: s_and_b32 s73, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s72, s8, 16 +; SI-NEXT: s_and_b32 s63, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s62, s7, 16 +; SI-NEXT: s_and_b32 s61, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s60, s6, 16 ; SI-NEXT: v_writelane_b32 v62, s46, 3 ; SI-NEXT: s_cbranch_execnz .LBB77_4 ; SI-NEXT: .LBB77_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[3:4], s[6:7], 1.0 -; SI-NEXT: v_add_f64 v[1:2], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[3:4], s[12:13], 1.0 +; SI-NEXT: v_add_f64 v[1:2], s[20:21], 1.0 ; SI-NEXT: v_add_f64 v[41:42], s[24:25], 1.0 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -134080,23 +133641,23 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v2 ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v2 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f64 v[2:3], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[51:52], s[26:27], 1.0 -; SI-NEXT: v_add_f64 v[49:50], s[28:29], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[8:9], 1.0 +; SI-NEXT: v_add_f64 v[51:52], s[40:41], 1.0 +; SI-NEXT: v_add_f64 v[49:50], s[42:43], 1.0 ; SI-NEXT: v_add_f64 v[35:36], s[44:45], 1.0 -; SI-NEXT: v_add_f64 v[31:32], s[42:43], 1.0 -; SI-NEXT: v_add_f64 v[27:28], s[40:41], 1.0 -; SI-NEXT: v_add_f64 v[23:24], s[14:15], 1.0 -; SI-NEXT: v_add_f64 v[19:20], s[12:13], 1.0 -; SI-NEXT: v_add_f64 v[15:16], s[10:11], 1.0 -; SI-NEXT: v_add_f64 v[11:12], s[8:9], 1.0 -; SI-NEXT: v_add_f64 v[7:8], s[4:5], 1.0 +; SI-NEXT: v_add_f64 v[31:32], s[28:29], 1.0 +; SI-NEXT: v_add_f64 v[27:28], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[23:24], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[19:20], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[15:16], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[11:12], s[14:15], 1.0 +; SI-NEXT: v_add_f64 v[7:8], s[10:11], 1.0 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: v_and_b32_e32 v46, 0xffff0000, v3 ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3 -; SI-NEXT: v_add_f64 v[59:60], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[59:60], s[6:7], 1.0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f64 v[3:4], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[3:4], s[4:5], 1.0 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v8 ; SI-NEXT: s_waitcnt expcnt(0) @@ -142563,25 +142124,21 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: v_readfirstlane_b32 s44, v1 -; SI-NEXT: v_readfirstlane_b32 s45, v2 -; SI-NEXT: v_readfirstlane_b32 s42, v3 -; SI-NEXT: v_readfirstlane_b32 s43, v4 -; SI-NEXT: v_readfirstlane_b32 s40, v5 -; SI-NEXT: v_readfirstlane_b32 s41, v6 -; SI-NEXT: v_readfirstlane_b32 s14, v7 -; SI-NEXT: v_readfirstlane_b32 s15, v8 -; SI-NEXT: v_readfirstlane_b32 s12, v9 -; SI-NEXT: v_readfirstlane_b32 s13, v10 -; SI-NEXT: v_readfirstlane_b32 s10, v11 -; SI-NEXT: v_readfirstlane_b32 s11, v12 -; SI-NEXT: v_readfirstlane_b32 s8, v13 -; SI-NEXT: v_readfirstlane_b32 s9, v14 -; SI-NEXT: v_readfirstlane_b32 s6, v15 -; SI-NEXT: v_readfirstlane_b32 s7, v16 -; SI-NEXT: v_readfirstlane_b32 s4, v17 -; SI-NEXT: s_and_b64 s[46:47], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s5, v18 +; SI-NEXT: v_mov_b32_e32 v19, s16 +; SI-NEXT: v_mov_b32_e32 v20, s17 +; SI-NEXT: v_mov_b32_e32 v21, s18 +; SI-NEXT: v_mov_b32_e32 v22, s19 +; SI-NEXT: v_mov_b32_e32 v27, s20 +; SI-NEXT: v_mov_b32_e32 v28, s21 +; SI-NEXT: v_mov_b32_e32 v31, s22 +; SI-NEXT: v_mov_b32_e32 v32, s23 +; SI-NEXT: v_mov_b32_e32 v29, s24 +; SI-NEXT: v_mov_b32_e32 v30, s25 +; SI-NEXT: v_mov_b32_e32 v25, s26 +; SI-NEXT: v_mov_b32_e32 v26, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v23, s28 +; SI-NEXT: v_mov_b32_e32 v24, s29 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -142600,500 +142157,622 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB81_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s46, s5, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s46 -; SI-NEXT: s_lshr_b32 s46, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s46 -; SI-NEXT: s_lshr_b32 s46, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s46 -; SI-NEXT: s_lshr_b32 s46, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s46 -; SI-NEXT: s_lshr_b32 s46, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s46 -; SI-NEXT: s_lshr_b32 s46, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s46 -; SI-NEXT: s_lshr_b32 s46, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s46 -; SI-NEXT: s_lshr_b32 s46, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s46 -; SI-NEXT: s_lshr_b32 s46, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s46 -; SI-NEXT: s_lshr_b32 s46, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s46 -; SI-NEXT: s_lshr_b32 s46, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s46 -; SI-NEXT: s_lshr_b32 s46, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s46 -; SI-NEXT: s_lshr_b32 s46, s41, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s46 -; SI-NEXT: s_lshr_b32 s46, s40, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s46 -; SI-NEXT: s_lshr_b32 s46, s43, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s46 -; SI-NEXT: s_lshr_b32 s46, s42, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s46 -; SI-NEXT: s_lshr_b32 s46, s45, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s46 -; SI-NEXT: s_lshr_b32 s46, s44, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s46 -; SI-NEXT: s_lshr_b32 s46, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s46 -; SI-NEXT: s_lshr_b32 s46, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s46 -; SI-NEXT: s_lshr_b32 s46, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s46 -; SI-NEXT: s_lshr_b32 s46, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s46 -; SI-NEXT: s_lshr_b32 s46, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s46 -; SI-NEXT: s_lshr_b32 s46, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s46 -; SI-NEXT: s_lshr_b32 s46, s23, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v17 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s46 -; SI-NEXT: s_lshr_b32 s46, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s46 -; SI-NEXT: s_lshr_b32 s46, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s46 -; SI-NEXT: s_lshr_b32 s46, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s46 -; SI-NEXT: s_lshr_b32 s46, s19, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v27 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s46 -; SI-NEXT: s_lshr_b32 s46, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s46 -; SI-NEXT: s_lshr_b32 s46, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s46 -; SI-NEXT: s_lshr_b32 s46, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v62, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v63, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v49, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 -; SI-NEXT: s_cbranch_execnz .LBB81_3 -; SI-NEXT: .LBB81_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[1:2], s[22:23], 1.0 -; SI-NEXT: v_add_f64 v[52:53], s[24:25], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v19 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v52 -; SI-NEXT: v_add_f64 v[48:49], s[26:27], 1.0 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v53 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v48 -; SI-NEXT: v_add_f64 v[36:37], s[28:29], 1.0 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v49 -; SI-NEXT: v_add_f64 v[14:15], s[10:11], 1.0 -; SI-NEXT: v_add_f64 v[10:11], s[8:9], 1.0 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v36 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_add_f64 v[4:5], s[4:5], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 -; SI-NEXT: v_add_f64 v[29:30], s[42:43], 1.0 -; SI-NEXT: v_add_f64 v[6:7], s[6:7], 1.0 -; SI-NEXT: v_add_f64 v[21:22], s[14:15], 1.0 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v21 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v57 -; SI-NEXT: v_add_f64 v[46:47], s[20:21], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v15 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v46 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v47 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v63, v37 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v15 -; SI-NEXT: v_mov_b32_e32 v15, v12 -; SI-NEXT: v_mov_b32_e32 v12, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v14 -; SI-NEXT: v_mov_b32_e32 v14, v5 -; SI-NEXT: v_mov_b32_e32 v5, v40 -; SI-NEXT: v_mov_b32_e32 v40, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_add_f64 v[33:34], s[44:45], 1.0 -; SI-NEXT: v_add_f64 v[25:26], s[40:41], 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v34 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v49 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v49, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f64 v[1:2], s[18:19], 1.0 -; SI-NEXT: v_add_f64 v[18:19], s[12:13], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v34 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v7 -; SI-NEXT: v_mov_b32_e32 v7, v42 -; SI-NEXT: v_mov_b32_e32 v42, v20 -; SI-NEXT: v_mov_b32_e32 v20, v21 -; SI-NEXT: v_mov_b32_e32 v21, v26 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v48 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v37, v13 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v33, v15 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v1 -; SI-NEXT: v_add_f64 v[1:2], s[16:17], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v19 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v33, v14 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v13 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v12 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v11 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v10 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v9 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v6 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v5 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v3 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v1 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v23 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v25 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v29 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v32 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v20 +; SI-NEXT: s_cbranch_execnz .LBB81_3 +; SI-NEXT: .LBB81_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f64 v[33:34], v[25:26], 1.0 +; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v34 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f64 v[1:2], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v1 +; SI-NEXT: v_add_f64 v[1:2], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v56 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v4 -; SI-NEXT: v_mov_b32_e32 v18, v3 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v34, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v6 +; SI-NEXT: v_mov_b32_e32 v48, v16 +; SI-NEXT: v_mov_b32_e32 v38, v17 +; SI-NEXT: v_mov_b32_e32 v36, v18 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: .LBB81_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v3, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_i32_e32 v4, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v48 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v47 -; SI-NEXT: v_add_i32_e32 v4, vcc, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v53 -; SI-NEXT: v_add_i32_e32 v4, vcc, 20, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v49 -; SI-NEXT: v_add_i32_e32 v4, vcc, 24, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_add_i32_e32 v4, vcc, 28, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 -; SI-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_add_i32_e32 v4, vcc, 36, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v6 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v29 -; SI-NEXT: v_add_i32_e32 v4, vcc, 40, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v33 -; SI-NEXT: v_add_i32_e32 v4, vcc, 44, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v20 -; SI-NEXT: v_add_i32_e32 v4, vcc, 48, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v63 -; SI-NEXT: v_add_i32_e32 v4, vcc, 52, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v61 -; SI-NEXT: v_add_i32_e32 v4, vcc, 56, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 -; SI-NEXT: v_add_i32_e32 v4, vcc, 60, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v26 -; SI-NEXT: v_add_i32_e32 v4, vcc, 64, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x44, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v54 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x48, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v59 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x4c, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v45 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v41 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x54, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v55 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v39 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x5c, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v19 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x60, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v11 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x64, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v18 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -143117,73 +142796,103 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB81_4: -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: s_branch .LBB81_2 ; ; VI-LABEL: bitcast_v16f64_to_v64f16_scalar: @@ -147874,23 +147583,51 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; VI-LABEL: bitcast_v64i16_to_v16f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v19, s16 ; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: v_mov_b32_e32 v2, s17 ; VI-NEXT: v_readfirstlane_b32 s7, v3 +; VI-NEXT: v_mov_b32_e32 v3, s18 ; VI-NEXT: v_readfirstlane_b32 s8, v4 +; VI-NEXT: v_mov_b32_e32 v4, s19 ; VI-NEXT: v_readfirstlane_b32 s9, v5 +; VI-NEXT: v_mov_b32_e32 v5, s20 ; VI-NEXT: v_readfirstlane_b32 s10, v6 +; VI-NEXT: v_mov_b32_e32 v6, s21 ; VI-NEXT: v_readfirstlane_b32 s11, v7 +; VI-NEXT: v_mov_b32_e32 v7, s22 ; VI-NEXT: v_readfirstlane_b32 s12, v8 +; VI-NEXT: v_mov_b32_e32 v8, s23 ; VI-NEXT: v_readfirstlane_b32 s13, v9 +; VI-NEXT: v_mov_b32_e32 v9, s24 ; VI-NEXT: v_readfirstlane_b32 s14, v10 +; VI-NEXT: v_mov_b32_e32 v10, s25 ; VI-NEXT: v_readfirstlane_b32 s15, v11 -; VI-NEXT: v_readfirstlane_b32 s40, v12 -; VI-NEXT: v_readfirstlane_b32 s41, v13 -; VI-NEXT: v_readfirstlane_b32 s42, v14 -; VI-NEXT: v_readfirstlane_b32 s43, v15 -; VI-NEXT: v_readfirstlane_b32 s44, v16 -; VI-NEXT: v_readfirstlane_b32 s45, v17 +; VI-NEXT: v_mov_b32_e32 v11, s26 +; VI-NEXT: v_readfirstlane_b32 s16, v12 +; VI-NEXT: v_mov_b32_e32 v12, s27 +; VI-NEXT: v_readfirstlane_b32 s17, v13 +; VI-NEXT: v_mov_b32_e32 v13, s28 +; VI-NEXT: v_readfirstlane_b32 s18, v14 +; VI-NEXT: v_mov_b32_e32 v14, s29 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_readfirstlane_b32 s19, v15 +; VI-NEXT: v_readfirstlane_b32 s20, v16 +; VI-NEXT: v_readfirstlane_b32 s21, v17 +; VI-NEXT: v_readfirstlane_b32 s22, v19 +; VI-NEXT: v_readfirstlane_b32 s23, v2 +; VI-NEXT: v_readfirstlane_b32 s24, v3 +; VI-NEXT: v_readfirstlane_b32 s25, v4 +; VI-NEXT: v_readfirstlane_b32 s26, v5 +; VI-NEXT: v_readfirstlane_b32 s27, v6 +; VI-NEXT: v_readfirstlane_b32 s28, v7 +; VI-NEXT: v_readfirstlane_b32 s29, v8 +; VI-NEXT: v_readfirstlane_b32 s40, v9 +; VI-NEXT: v_readfirstlane_b32 s41, v10 +; VI-NEXT: v_readfirstlane_b32 s42, v11 +; VI-NEXT: v_readfirstlane_b32 s43, v12 +; VI-NEXT: v_readfirstlane_b32 s44, v13 +; VI-NEXT: v_readfirstlane_b32 s45, v14 ; VI-NEXT: v_readfirstlane_b32 s46, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s47, v1 @@ -147907,8 +147644,38 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; VI-NEXT: s_and_b32 s4, s46, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s29, 3 +; VI-NEXT: s_add_i32 s5, s45, 3 ; VI-NEXT: s_add_i32 s46, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s45, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s44, 3 +; VI-NEXT: s_add_i32 s45, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s44, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s43, 3 +; VI-NEXT: s_add_i32 s44, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s43, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s42, 3 +; VI-NEXT: s_add_i32 s43, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s42, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s41, 3 +; VI-NEXT: s_add_i32 s42, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s41, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s40, 3 +; VI-NEXT: s_add_i32 s41, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s40, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s29, 3 +; VI-NEXT: s_add_i32 s40, s4, 0x30000 ; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -147977,38 +147744,8 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s45, 3 -; VI-NEXT: s_add_i32 s16, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s45, 0xffff0000 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s44, 3 -; VI-NEXT: s_add_i32 s45, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s44, 0xffff0000 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s43, 3 -; VI-NEXT: s_add_i32 s44, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s43, 0xffff0000 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s42, 3 -; VI-NEXT: s_add_i32 s43, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s42, 0xffff0000 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s41, 3 -; VI-NEXT: s_add_i32 s42, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s41, 0xffff0000 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s40, 3 -; VI-NEXT: s_add_i32 s41, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s40, 0xffff0000 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s15, 3 -; VI-NEXT: s_add_i32 s40, s4, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 ; VI-NEXT: s_and_b32 s4, s15, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -148059,20 +147796,20 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s6, s4, 0x30000 ; VI-NEXT: .LBB87_3: ; %end -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 -; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 -; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v0, s22 +; VI-NEXT: v_mov_b32_e32 v1, s23 +; VI-NEXT: v_mov_b32_e32 v2, s24 +; VI-NEXT: v_mov_b32_e32 v3, s25 +; VI-NEXT: v_mov_b32_e32 v4, s26 +; VI-NEXT: v_mov_b32_e32 v5, s27 +; VI-NEXT: v_mov_b32_e32 v6, s28 +; VI-NEXT: v_mov_b32_e32 v7, s29 +; VI-NEXT: v_mov_b32_e32 v8, s40 +; VI-NEXT: v_mov_b32_e32 v9, s41 +; VI-NEXT: v_mov_b32_e32 v10, s42 +; VI-NEXT: v_mov_b32_e32 v11, s43 +; VI-NEXT: v_mov_b32_e32 v12, s44 +; VI-NEXT: v_mov_b32_e32 v13, s45 ; VI-NEXT: v_mov_b32_e32 v14, s46 ; VI-NEXT: v_mov_b32_e32 v15, s47 ; VI-NEXT: v_mov_b32_e32 v16, s6 @@ -148085,12 +147822,12 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; VI-NEXT: v_mov_b32_e32 v23, s13 ; VI-NEXT: v_mov_b32_e32 v24, s14 ; VI-NEXT: v_mov_b32_e32 v25, s15 -; VI-NEXT: v_mov_b32_e32 v26, s40 -; VI-NEXT: v_mov_b32_e32 v27, s41 -; VI-NEXT: v_mov_b32_e32 v28, s42 -; VI-NEXT: v_mov_b32_e32 v29, s43 -; VI-NEXT: v_mov_b32_e32 v30, s44 -; VI-NEXT: v_mov_b32_e32 v31, s45 +; VI-NEXT: v_mov_b32_e32 v26, s16 +; VI-NEXT: v_mov_b32_e32 v27, s17 +; VI-NEXT: v_mov_b32_e32 v28, s18 +; VI-NEXT: v_mov_b32_e32 v29, s19 +; VI-NEXT: v_mov_b32_e32 v30, s20 +; VI-NEXT: v_mov_b32_e32 v31, s21 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB87_4: ; VI-NEXT: s_branch .LBB87_2 @@ -165370,172 +165107,171 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v80, off, s32 ; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:248 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:244 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:240 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:236 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:232 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:216 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:204 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:200 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:196 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:192 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:188 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:184 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:176 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:172 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:168 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:160 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:152 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:144 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:136 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:132 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:128 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1b ; 112-byte Folded Spill -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:80 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:20 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:12 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr181_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr152_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:192 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:120 +; GFX11-TRUE16-NEXT: s_clause 0x1a ; 108-byte Folded Spill +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:12 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr180_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr143_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr141_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr183_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr140_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr179_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr142_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr140_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr182_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr127_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr125_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr41_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr123_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr139_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr181_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr126_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr124_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr40_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr121_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr111_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr72_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr109_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr122_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr183_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr110_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr63_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr108_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr46_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr107_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr104_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr92_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr106_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr104_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr95_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr74_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr89_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr79_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr73_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr90_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr127_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr106_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr77_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr75_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr62_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr139_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr152_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr61_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr59_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr155_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr57_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr138_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr58_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr154_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr47_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr44_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr142_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr141_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr138_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr137_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr126_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr125_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr124_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr122_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr123_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr121_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr111_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr110_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr108_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr95_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr109_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr107_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr94_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr93_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr90_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr92_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr164_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr89_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr164_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr88_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr167_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr79_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr77_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr166_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr73_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr63_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr177_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr75_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr62_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr176_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr58_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr56_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr179_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr167_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr47_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr178_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr177_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr41_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 @@ -165554,60 +165290,60 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[99:100], 24, v[13:14] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[114:115], 24, v[11:12] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[117:118], 24, v[9:10] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[144:145], 24, v[3:4] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 24, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v47, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 24, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 24, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v77, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v79, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v89, 24, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v91, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v92, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 24, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v109, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v111, 24, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v121, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v123, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v125, 24, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v127, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v140, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v141, 24, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v143, 8, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v152, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[130:131], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[133:134], 24, v[5:6] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v60, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v74, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v76, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v78, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v88, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v91, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v104, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v106, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v108, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v110, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v120, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v122, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v124, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v126, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v139, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v140, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v142, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v143, 8, v1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v42, 24, v81 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 8, v81 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v41, 24, v81 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v42, 8, v81 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 8, v80 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 24, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v60, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 24, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v73, 8, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v76, 8, v27 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v78, 24, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v88, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v93, 24, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v95, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v108, 24, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v110, 8, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v120, 8, v21 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v122, 24, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v124, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v126, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v137, 24, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v138, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v142, 8, v17 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[128:129], 24, v[7:8] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[134:135], 24, v[5:6] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[145:146], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 8, v80 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v47, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 8, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v77, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v79, 8, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v89, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v92, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v93, 8, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v109, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v111, 8, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v121, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v123, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v125, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v136, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v137, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v141, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[134:135], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[144:145], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[82:83], 24, v[80:81] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[86:87], 24, v[29:30] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[100:101], 24, v[27:28] @@ -165615,70 +165351,70 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[112:113], 24, v[23:24] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[115:116], 24, v[21:22] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[118:119], 24, v[19:20] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[132:133], 24, v[17:18] -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v181.h, v1.l +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[131:132], 24, v[17:18] +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v180.h, v1.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v180.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.h, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.h, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v182.h, v3.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.h, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v182.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v181.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.h, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v41.h, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v40.h, v5.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.h, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v40.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.h, v6.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.h, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v72.h, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v63.h, v7.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.h, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v46.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v45.h, v8.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.h, v8.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v104.h, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v95.h, v9.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.h, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v74.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v73.h, v10.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.h, v10.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v136.h, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v127.h, v11.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v102.h, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v106.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v105.h, v12.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v103.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v153.h, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v139.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v14.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v155.h, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.h, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v154.h, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.h, v16.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v152.h, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v13.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v138.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v14.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v154.h, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.h, v15.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v153.h, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.h, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.h, v17.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v17.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v18.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v151.h, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v150.h, v19.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v150.h, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v20.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v20.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v161.h, v21.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v160.h, v21.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v21.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v160.h, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v151.h, v22.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v22.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v163.h, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v162.h, v23.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v23.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v162.h, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v161.h, v24.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v24.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v165.h, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v164.h, v25.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v25.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v164.h, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v163.h, v26.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v26.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v167.h, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v166.h, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v27.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v166.h, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v165.h, v28.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v28.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v177.h, v29.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v176.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v176.h, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v167.h, v30.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v30.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.h, v80.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v178.h, v80.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.h, v80.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v178.h, v81.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v177.h, v81.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v81.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3 @@ -165701,151 +165437,142 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB90_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v17 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v20 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v32 :: v_dual_add_f32 v31, 0x40c00000, v31 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v32 :: v_dual_lshlrev_b32 v31, 16, v18 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v31, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v31 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v31, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX11-TRUE16-NEXT: v_add3_u32 v31, v38, v34, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v39, v32, v37, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_lshlrev_b32 v17, 16, v17 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v18, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v18 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v18, 0x7fff -; GFX11-TRUE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v34 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v35, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_cndmask_b32 v32, v35, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v17, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v17 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v37, v48, v17, 0x7fff -; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v137, 24, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v138, 8, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v149, v37, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v136, 24, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v137, 8, v32 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v148, v37, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v34 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v19 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v20 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v31, v33, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v20, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v18, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v20 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v34 :: v_dual_add_f32 v19, 0x40c00000, v19 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v20, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v18, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v31.l, v149.h ; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v36, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v150, v33, v35, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v31.l, v148.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v149, v33, v35, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v19 -; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v36, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[132:133], 24, v[31:32] +; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v22 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v17, v34, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v19, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v142, 8, v31 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v34.l, v149.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v141, 8, v31 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v19, 0x7fff -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v151, v17, v33 :: v_dual_and_b32 v18, 0xffff0000, v22 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v150, v17, v33, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_cndmask_b32 v33, v20, v35 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v21 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v20, v35 :: v_dual_and_b32 v20, 0xffff0000, v21 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v121, 24, v34 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v123, 8, v34 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v20, 0x40c00000, v20 :: v_dual_add_f32 v21, 0x40c00000, v21 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 0x40c00000, v21 :: v_dual_lshlrev_b32 v22, 16, v22 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v18, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v34.l, v150.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v33.l, v151.h ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v22, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v22 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v122, 24, v34 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v124, 8, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v22, 0x7fff -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v126, 8, v33 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v160, v19, v35, vcc_lo +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v151, v19, v35 :: v_dual_lshlrev_b32 v22, 16, v24 +; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v20, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v20 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v20, 0x7fff +; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v18 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v18, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v24 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v21 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v17, v36, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v21, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v36.l, v151.h ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v21, 0x7fff -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v23 -; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v20, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v20 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v161, v17, v24, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v160, v17, v24 :: v_dual_lshlrev_b32 v21, 16, v23 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v20, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v23 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v22 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v36.l, v160.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v18 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v19, v35, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v22, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v35.l, v161.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v18, 0x7fff +; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 0x40c00000, v21 :: v_dual_add_f32 v20, 0x40c00000, v20 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v22, 0x7fff ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v120, 8, v35 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v162, v19, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 24, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v109, 8, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v161, v19, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v20, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v21 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v20, 0x7fff -; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v18 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v18, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v26 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v38, v17, v24, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v21, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v20, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v20 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v21, 0x7fff ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v108, 24, v36 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v110, 8, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v33.l, v150.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v38.l, v161.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v163, v17, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v162, v17, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v22 @@ -165859,9 +165586,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v22, 0x7fff ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v28 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v37.l, v163.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v125, 8, v33 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v164, v19, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v163, v19, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v20, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v21 @@ -165874,9 +165601,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v21, 0x7fff ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v27 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v95, 8, v37 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v165, v17, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v35.l, v160.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v49.l, v163.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v164, v17, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v22 @@ -165889,10 +165617,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v22, 0x7fff ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v30 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v38.l, v162.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v48.l, v165.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v166, v19, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v77, 24, v49 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v79, 8, v49 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v111, 8, v35 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v165, v19, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v20, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v21 @@ -165905,10 +165633,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v21, 0x7fff ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[112:113], 24, v[37:38] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v93, 24, v38 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 8, v38 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v167, v17, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v51.l, v165.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v92, 24, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v93, 8, v38 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v166, v17, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v22 @@ -165922,9 +165650,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v22, 0x7fff ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v81 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v50.l, v167.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v176, v19, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 24, v51 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 8, v51 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v167, v19, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v20, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v21 @@ -165938,9 +165666,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v21, 0x7fff ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v80 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v49.l, v164.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v53.l, v167.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v177, v17, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v176, v17, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v22 @@ -165953,10 +165681,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v22, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v21 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v52.l, v177.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v78, 24, v49 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v88, 8, v49 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v178, v19, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v37.l, v162.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v47, 24, v53 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 8, v53 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v177, v19, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v20, 16, 1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v2 @@ -165969,9 +165697,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v21, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v55.l, v177.h +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[112:113], 24, v[37:38] ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v179, v17, v22, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 8, v37 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v178, v17, v22, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v1 @@ -165982,12 +165712,13 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v18, 0x7fff ; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v20, 0x40c00000, v20 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v54.l, v179.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v41, 24, v55 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v42, 8, v55 ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v20, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v180, v19, v21, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v179, v19, v21, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v4 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -166000,10 +165731,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v1, 0x7fff -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 8, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v65.l, v179.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v18, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v181, v17, v19, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v180, v17, v19, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -166018,9 +165749,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v51.l, v166.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v64.l, v181.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v182, v2, v19, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v48.l, v164.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v140, 24, v65 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v181, v2, v19, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v17, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v3 @@ -166030,13 +165761,13 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v17, 0x7fff ; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 24, v51 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v67.l, v181.h ; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v3, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v53.l, v176.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v73, 8, v51 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v183, v1, v18, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v50.l, v166.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v124, 24, v67 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v182, v1, v18, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 ; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v4, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v6 @@ -166047,13 +165778,13 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v4, 0x7fff ; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v66.l, v183.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v126, 8, v67 ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v6, 0x7fff ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 24, v53 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v53 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v76, 8, v50 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v40, v2, v17, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v142, 8, v65 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 8, v50 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v89, 8, v48 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v183, v2, v17, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v8 @@ -166067,25 +165798,26 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v3, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v41, v2, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v69.l, v183.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v40, v2, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v3, v5, v6, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v6 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v110, 24, v69 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v68, v1, v17, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v1, v2, v4, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v10 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v41.h -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v46, v3, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v120, 8, v69 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_cndmask_b32 v45, v3, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v7 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_cndmask_b32 v71, v1, v8 ; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1 @@ -166096,8 +165828,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v3, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v46.h -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[86:87], 24, v[52:53] +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v45.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v52.l, v176.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v70, v1, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 @@ -166105,14 +165837,15 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v6, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6 ; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v72, v4, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v63, v4, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v12 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v11 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v55.l, v178.h -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v74, v1, v4 :: v_dual_lshlrev_b32 v1, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, v63.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v73, v1, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v9 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v12 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 @@ -166126,30 +165859,31 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v13 ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v3, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v67.l, v182.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, v74.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, v72.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, v73.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v54.l, v178.h +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[130:131], 24, v[70:71] ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v84, v2, v6, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v7, v1, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[128:129], 24, v[70:71] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[82:83], 24, v[54:55] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[86:87], 24, v[52:53] ; GFX11-TRUE16-NEXT: v_add3_u32 v1, v6, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v104, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v95, v2, v3, vcc_lo ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v8 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_add3_u32 v3, v7, v4, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v14 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v104.h -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v106, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v95.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v105, v1, v2, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v11 ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v6, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v14 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v65.l, v180.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v64.l, v180.h ; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v6, 0x7fff ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v103, v3, v7, vcc_lo @@ -166157,7 +165891,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v13 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v103.l, v106.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v103.l, v105.h ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[117:118], 24, v[84:85] ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v102, v2, v3, vcc_lo ; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v4 :: v_dual_add_f32 v3, 0x40c00000, v5 @@ -166168,8 +165902,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[118:119], 24, v[33:34] -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v136, v4, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[144:145], 24, v[64:65] +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v127, v4, v5, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v7, v2, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 @@ -166177,19 +165911,19 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v1, 16, 1 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v9 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v131, v4, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v129, v4, v5, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX11-TRUE16-NEXT: v_add3_u32 v3, v8, v1, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v16 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v15 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v139, v6, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v138, v6, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v15 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v139.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v130, v3, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.l, v138.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v128, v3, v4, vcc_lo ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v16 ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v6, v2, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 @@ -166197,11 +165931,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v102.l, v136.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v40.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v40.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v102.l, v127.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v153, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v152, v4, v6, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v7, v1, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 @@ -166210,141 +165944,135 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v8 ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v153.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v154, v2, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.l, v152.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v153, v2, v9, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[99:100], 24, v[130:131] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[114:115], 24, v[102:103] -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v155, v7, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v66.l, v182.h +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[99:100], 24, v[128:129] +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v154, v7, v11, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v10, v6, 0x7fff -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[144:145], 24, v[66:67] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[134:135], 24, v[68:69] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[145:146], 24, v[64:65] -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v148, v4, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.l, v154.h +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[114:115], 24, v[102:103] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[100:101], 24, v[50:51] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[115:116], 24, v[35:36] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 24, v131 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v147, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.l, v155.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 24, v148 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v47, 8, v148 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 8, v131 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 8, v130 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[96:97], 24, v[147:148] +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v147, v4, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.l, v153.h +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[133:134], 24, v[68:69] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[134:135], 24, v[66:67] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[118:119], 24, v[33:34] +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v146, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v154.h +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[131:132], 24, v[31:32] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 24, v147 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 8, v147 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 24, v129 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[96:97], 24, v[146:147] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[97:98], 24, v[48:49] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 8, v147 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 24, v103 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v77, 8, v103 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v79, 8, v102 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v89, 24, v85 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v91, 8, v85 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v92, 8, v84 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 24, v71 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 8, v71 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v109, 8, v70 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v111, 24, v69 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v121, 8, v69 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v123, 8, v68 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v125, 24, v67 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v127, 8, v67 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v140, 8, v66 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v141, 24, v65 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v143, 8, v65 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v152, 8, v64 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v42, 24, v55 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 8, v55 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 8, v54 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v60, 8, v52 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 8, v146 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v60, 8, v129 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 8, v128 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v74, 24, v103 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v76, 8, v103 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v78, 8, v102 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v88, 24, v85 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 8, v85 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v91, 8, v84 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v104, 24, v71 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v106, 8, v71 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v108, 8, v70 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v122, 8, v68 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v139, 8, v66 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v143, 8, v64 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 8, v54 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 8, v52 ; GFX11-TRUE16-NEXT: .LBB90_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v181.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v152.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v180.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v143.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v64.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v145.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v180.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v143.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v179.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v142.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v65.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v141.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v183.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v140.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v140.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v182.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v139.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v66.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v134.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v1.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v2.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.l, v3.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.l, v4.h ; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v5.h ; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v182.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v127.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v181.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v126.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v67.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v125.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v41.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v123.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v124.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v40.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v122.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v68.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v134.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v40.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v121.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v133.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v183.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v120.l ; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h ; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v5.l, v5.h ; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v6.l, v6.h ; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v7.l, v7.h ; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v8.l, v8.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v69.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v111.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v72.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v109.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v110.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v63.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v108.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v70.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v128.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v46.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v107.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v130.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v45.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v106.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v71.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v105.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v104.l ; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v7.l ; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.h, v8.l ; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v8.h, v9.l ; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v9.h, v10.l ; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v10.h, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v104.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v92.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v95.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v91.l ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v85.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v89.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v136.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v79.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v88.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v127.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v78.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v102.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v114.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v106.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v77.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v105.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v76.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v103.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v75.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v74.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v84.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v117.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v74.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v91.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v73.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v90.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v9.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.l ; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l ; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v5.l ; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v5.h, v6.l ; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v6.h, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v153.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v62.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v130.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v152.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v61.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v128.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v99.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v139.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v61.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v131.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v59.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v155.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v57.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v138.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v60.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v129.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v58.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v154.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v56.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v10.l, v10.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v11.l, v11.h ; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h @@ -166352,61 +166080,61 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v7.l, v7.h ; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v8.l, v8.h ; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v147.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v146.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v96.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v154.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v47.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v148.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v44.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v149.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v142.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v153.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v46.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v147.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v43.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v148.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v141.l ; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v132.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v131.l ; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.l ; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.h, v9.l ; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v9.h, v10.l ; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.h, v11.l ; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v11.h, v12.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v138.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v137.l ; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v137.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v151.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v126.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v136.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v150.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v125.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v118.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v150.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v124.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v149.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v123.l ; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h ; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v11.l, v11.h ; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v12.l, v12.h ; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v13.l, v13.h ; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v14.l, v14.h ; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v122.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v161.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v120.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v121.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v160.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v111.l ; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v35.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v115.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v160.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v110.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v151.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v109.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v108.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v107.l ; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v13.l ; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.h, v14.l ; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v14.h, v15.l ; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v15.h, v16.l ; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v16.h, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v163.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v95.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v162.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v94.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v37.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v112.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v162.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v94.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v161.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v93.l ; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v93.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v165.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v90.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v92.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v164.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v89.l ; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h ; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.l, v16.h ; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v17.l, v17.h @@ -166414,12 +166142,12 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v19.l, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v48.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v97.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v164.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v88.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v163.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v79.l ; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v49.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v78.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v167.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v76.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v77.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v166.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v75.l ; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v50.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v100.l ; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v18.l @@ -166427,31 +166155,31 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v19.h, v20.l ; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v20.h, v21.l ; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v21.h, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v166.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v73.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v165.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v72.l ; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v51.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v63.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v177.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v60.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v62.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v176.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v59.l ; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v52.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v86.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v176.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v58.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v167.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v57.l ; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h ; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v21.l, v21.h ; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v22.l, v22.h ; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v23.l, v23.h ; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v24.l, v24.h ; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v53.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v56.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v179.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v45.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v47.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v178.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v44.l ; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v54.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v82.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v178.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.l, 8, v43.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v177.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.l, 8, v42.l ; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v55.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v42.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v41.l ; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v23.l ; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.h, v24.l ; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v24.h, v25.l @@ -166465,67 +166193,66 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 ; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload -; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:12 -; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:20 -; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:80 -; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:128 -; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:132 -; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:136 -; GFX11-TRUE16-NEXT: s_clause 0x1b ; 112-byte Folded Reload -; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:144 -; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:152 -; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:160 -; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:168 -; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:172 -; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:176 -; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:184 -; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:188 -; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:192 -; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:196 -; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:200 -; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:204 -; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:216 -; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:232 -; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:236 -; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:240 -; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:244 -; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:136 +; GFX11-TRUE16-NEXT: s_clause 0x1a ; 108-byte Folded Reload +; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:192 +; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:244 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -169635,6447 +169362,5203 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-LABEL: bitcast_v64bf16_to_v128i8_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v63, s30, 0 -; VI-NEXT: v_writelane_b32 v63, s31, 1 -; VI-NEXT: v_writelane_b32 v63, s34, 2 -; VI-NEXT: v_writelane_b32 v63, s35, 3 -; VI-NEXT: v_writelane_b32 v63, s36, 4 -; VI-NEXT: v_writelane_b32 v63, s37, 5 -; VI-NEXT: v_writelane_b32 v63, s38, 6 -; VI-NEXT: v_writelane_b32 v63, s39, 7 -; VI-NEXT: v_writelane_b32 v63, s48, 8 -; VI-NEXT: v_writelane_b32 v63, s49, 9 -; VI-NEXT: v_writelane_b32 v63, s50, 10 -; VI-NEXT: v_writelane_b32 v63, s51, 11 -; VI-NEXT: v_writelane_b32 v63, s52, 12 -; VI-NEXT: v_writelane_b32 v63, s53, 13 -; VI-NEXT: v_writelane_b32 v63, s54, 14 -; VI-NEXT: v_writelane_b32 v63, s55, 15 -; VI-NEXT: v_writelane_b32 v63, s64, 16 -; VI-NEXT: v_writelane_b32 v63, s65, 17 -; VI-NEXT: v_writelane_b32 v63, s66, 18 -; VI-NEXT: v_writelane_b32 v63, s67, 19 -; VI-NEXT: v_writelane_b32 v63, s68, 20 -; VI-NEXT: v_writelane_b32 v63, s69, 21 -; VI-NEXT: v_writelane_b32 v63, s70, 22 -; VI-NEXT: v_writelane_b32 v63, s71, 23 -; VI-NEXT: v_writelane_b32 v63, s80, 24 -; VI-NEXT: v_writelane_b32 v63, s81, 25 -; VI-NEXT: v_writelane_b32 v63, s82, 26 -; VI-NEXT: v_writelane_b32 v63, s83, 27 -; VI-NEXT: v_writelane_b32 v63, s84, 28 -; VI-NEXT: v_writelane_b32 v63, s85, 29 -; VI-NEXT: v_writelane_b32 v63, s86, 30 +; VI-NEXT: v_writelane_b32 v20, s30, 0 +; VI-NEXT: v_writelane_b32 v20, s31, 1 +; VI-NEXT: v_writelane_b32 v20, s34, 2 +; VI-NEXT: v_writelane_b32 v20, s35, 3 +; VI-NEXT: v_writelane_b32 v20, s36, 4 +; VI-NEXT: v_writelane_b32 v20, s37, 5 +; VI-NEXT: v_writelane_b32 v20, s38, 6 +; VI-NEXT: v_writelane_b32 v20, s39, 7 +; VI-NEXT: v_writelane_b32 v20, s48, 8 +; VI-NEXT: v_writelane_b32 v20, s49, 9 +; VI-NEXT: v_writelane_b32 v20, s50, 10 +; VI-NEXT: v_writelane_b32 v20, s51, 11 +; VI-NEXT: v_writelane_b32 v20, s52, 12 +; VI-NEXT: v_writelane_b32 v20, s53, 13 +; VI-NEXT: v_writelane_b32 v20, s54, 14 +; VI-NEXT: v_writelane_b32 v20, s55, 15 +; VI-NEXT: v_writelane_b32 v20, s64, 16 +; VI-NEXT: v_writelane_b32 v20, s65, 17 +; VI-NEXT: v_writelane_b32 v20, s66, 18 +; VI-NEXT: v_writelane_b32 v20, s67, 19 +; VI-NEXT: v_writelane_b32 v20, s68, 20 +; VI-NEXT: v_writelane_b32 v20, s69, 21 +; VI-NEXT: v_writelane_b32 v20, s70, 22 +; VI-NEXT: v_writelane_b32 v20, s71, 23 +; VI-NEXT: v_writelane_b32 v20, s80, 24 +; VI-NEXT: v_writelane_b32 v20, s81, 25 +; VI-NEXT: v_writelane_b32 v20, s82, 26 +; VI-NEXT: v_writelane_b32 v20, s83, 27 +; VI-NEXT: v_writelane_b32 v20, s84, 28 +; VI-NEXT: v_writelane_b32 v20, s85, 29 +; VI-NEXT: v_readfirstlane_b32 s40, v3 +; VI-NEXT: v_mov_b32_e32 v3, s16 +; VI-NEXT: v_readfirstlane_b32 s41, v4 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_readfirstlane_b32 s16, v5 +; VI-NEXT: v_mov_b32_e32 v5, s18 +; VI-NEXT: v_readfirstlane_b32 s17, v6 +; VI-NEXT: v_mov_b32_e32 v6, s19 +; VI-NEXT: v_readfirstlane_b32 s14, v7 +; VI-NEXT: v_mov_b32_e32 v7, s20 +; VI-NEXT: v_readfirstlane_b32 s15, v8 +; VI-NEXT: v_mov_b32_e32 v8, s21 +; VI-NEXT: v_readfirstlane_b32 s12, v9 +; VI-NEXT: v_mov_b32_e32 v9, s22 +; VI-NEXT: v_readfirstlane_b32 s13, v10 +; VI-NEXT: v_mov_b32_e32 v10, s23 +; VI-NEXT: v_readfirstlane_b32 s10, v11 +; VI-NEXT: v_mov_b32_e32 v11, s24 +; VI-NEXT: v_readfirstlane_b32 s11, v12 +; VI-NEXT: v_mov_b32_e32 v12, s25 +; VI-NEXT: v_readfirstlane_b32 s8, v13 +; VI-NEXT: v_mov_b32_e32 v13, s26 +; VI-NEXT: v_readfirstlane_b32 s9, v14 +; VI-NEXT: v_mov_b32_e32 v14, s27 +; VI-NEXT: v_readfirstlane_b32 s6, v15 +; VI-NEXT: v_mov_b32_e32 v15, s28 +; VI-NEXT: v_readfirstlane_b32 s7, v16 +; VI-NEXT: v_mov_b32_e32 v16, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; VI-NEXT: v_writelane_b32 v63, s87, 31 -; VI-NEXT: v_readfirstlane_b32 s48, v3 -; VI-NEXT: v_readfirstlane_b32 s49, v4 -; VI-NEXT: v_readfirstlane_b32 s38, v5 -; VI-NEXT: v_readfirstlane_b32 s39, v6 -; VI-NEXT: v_readfirstlane_b32 s36, v7 -; VI-NEXT: v_readfirstlane_b32 s37, v8 -; VI-NEXT: v_readfirstlane_b32 s34, v9 -; VI-NEXT: v_readfirstlane_b32 s35, v10 -; VI-NEXT: v_readfirstlane_b32 s30, v11 -; VI-NEXT: v_readfirstlane_b32 s31, v12 -; VI-NEXT: v_readfirstlane_b32 s90, v13 -; VI-NEXT: v_readfirstlane_b32 s91, v14 -; VI-NEXT: v_readfirstlane_b32 s88, v15 -; VI-NEXT: v_readfirstlane_b32 s89, v16 -; VI-NEXT: v_readfirstlane_b32 s76, v17 -; VI-NEXT: v_readfirstlane_b32 s77, v18 -; VI-NEXT: v_readfirstlane_b32 s4, v1 -; VI-NEXT: s_and_b64 s[6:7], vcc, exec -; VI-NEXT: v_readfirstlane_b32 s5, v2 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr61 : SGPR spill to VGPR lane -; VI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane -; VI-NEXT: s_cbranch_scc0 .LBB91_3 +; VI-NEXT: v_writelane_b32 v20, s86, 30 +; VI-NEXT: v_readfirstlane_b32 s4, v17 +; VI-NEXT: v_readfirstlane_b32 s5, v18 +; VI-NEXT: v_readfirstlane_b32 s44, v3 +; VI-NEXT: v_readfirstlane_b32 s45, v4 +; VI-NEXT: v_readfirstlane_b32 s42, v5 +; VI-NEXT: v_readfirstlane_b32 s43, v6 +; VI-NEXT: v_readfirstlane_b32 s28, v7 +; VI-NEXT: v_readfirstlane_b32 s29, v8 +; VI-NEXT: v_readfirstlane_b32 s26, v9 +; VI-NEXT: v_readfirstlane_b32 s27, v10 +; VI-NEXT: v_readfirstlane_b32 s24, v11 +; VI-NEXT: v_readfirstlane_b32 s25, v12 +; VI-NEXT: v_readfirstlane_b32 s22, v13 +; VI-NEXT: v_readfirstlane_b32 s23, v14 +; VI-NEXT: v_readfirstlane_b32 s20, v15 +; VI-NEXT: v_readfirstlane_b32 s21, v16 +; VI-NEXT: v_readfirstlane_b32 s18, v1 +; VI-NEXT: s_and_b64 s[46:47], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s19, v2 +; VI-NEXT: v_writelane_b32 v20, s87, 31 +; VI-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane +; VI-NEXT: ; implicit-def: $vgpr21 : SGPR spill to VGPR lane +; VI-NEXT: s_cbranch_scc0 .LBB91_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s6, s5, 24 -; VI-NEXT: v_writelane_b32 v61, s6, 26 -; VI-NEXT: s_lshr_b32 s6, s5, 16 -; VI-NEXT: v_writelane_b32 v61, s6, 27 -; VI-NEXT: s_lshr_b32 s6, s5, 8 -; VI-NEXT: v_writelane_b32 v61, s6, 28 -; VI-NEXT: s_lshr_b32 s6, s4, 16 -; VI-NEXT: v_writelane_b32 v61, s6, 29 -; VI-NEXT: s_lshr_b32 s6, s4, 8 -; VI-NEXT: v_writelane_b32 v61, s6, 30 -; VI-NEXT: s_lshr_b32 s6, s29, 24 -; VI-NEXT: v_writelane_b32 v61, s6, 31 -; VI-NEXT: s_lshr_b32 s6, s29, 16 -; VI-NEXT: v_writelane_b32 v61, s6, 32 -; VI-NEXT: s_lshr_b32 s6, s29, 8 -; VI-NEXT: v_writelane_b32 v61, s6, 33 -; VI-NEXT: s_lshr_b32 s6, s28, 16 -; VI-NEXT: v_writelane_b32 v61, s6, 34 -; VI-NEXT: s_lshr_b32 s6, s28, 8 -; VI-NEXT: v_writelane_b32 v61, s6, 35 -; VI-NEXT: s_lshr_b32 s6, s27, 24 -; VI-NEXT: v_writelane_b32 v61, s6, 36 -; VI-NEXT: s_lshr_b32 s6, s27, 16 -; VI-NEXT: v_writelane_b32 v61, s6, 37 -; VI-NEXT: s_lshr_b32 s6, s27, 8 -; VI-NEXT: v_writelane_b32 v61, s6, 38 -; VI-NEXT: s_lshr_b32 s6, s26, 16 -; VI-NEXT: v_writelane_b32 v61, s6, 39 -; VI-NEXT: s_lshr_b32 s6, s26, 8 -; VI-NEXT: v_writelane_b32 v61, s6, 40 -; VI-NEXT: s_lshr_b32 s6, s25, 24 -; VI-NEXT: v_writelane_b32 v61, s6, 41 -; VI-NEXT: s_lshr_b32 s6, s25, 16 -; VI-NEXT: v_writelane_b32 v61, s6, 42 -; VI-NEXT: s_lshr_b32 s6, s25, 8 -; VI-NEXT: v_writelane_b32 v61, s6, 43 -; VI-NEXT: s_lshr_b32 s6, s24, 16 -; VI-NEXT: v_writelane_b32 v61, s6, 44 -; VI-NEXT: s_lshr_b32 s6, s24, 8 -; VI-NEXT: v_writelane_b32 v61, s6, 45 -; VI-NEXT: s_lshr_b32 s6, s23, 24 -; VI-NEXT: v_writelane_b32 v61, s6, 46 -; VI-NEXT: s_lshr_b32 s6, s23, 16 -; VI-NEXT: v_writelane_b32 v61, s6, 47 -; VI-NEXT: s_lshr_b32 s6, s23, 8 -; VI-NEXT: v_writelane_b32 v61, s6, 48 -; VI-NEXT: s_lshr_b32 s6, s22, 16 -; VI-NEXT: v_writelane_b32 v61, s6, 49 -; VI-NEXT: s_lshr_b32 s6, s22, 8 -; VI-NEXT: v_writelane_b32 v61, s6, 50 -; VI-NEXT: s_lshr_b32 s6, s21, 24 -; VI-NEXT: v_writelane_b32 v61, s6, 51 -; VI-NEXT: s_lshr_b32 s6, s21, 16 -; VI-NEXT: v_writelane_b32 v61, s6, 52 -; VI-NEXT: s_lshr_b32 s6, s21, 8 -; VI-NEXT: v_writelane_b32 v61, s6, 53 -; VI-NEXT: s_lshr_b32 s6, s20, 16 -; VI-NEXT: v_writelane_b32 v61, s6, 54 -; VI-NEXT: s_lshr_b32 s6, s20, 8 -; VI-NEXT: v_writelane_b32 v61, s6, 55 -; VI-NEXT: s_lshr_b32 s6, s19, 24 -; VI-NEXT: v_writelane_b32 v61, s6, 56 -; VI-NEXT: s_lshr_b32 s6, s19, 16 -; VI-NEXT: v_writelane_b32 v61, s6, 57 -; VI-NEXT: s_lshr_b32 s6, s19, 8 -; VI-NEXT: v_writelane_b32 v61, s6, 58 -; VI-NEXT: s_lshr_b32 s6, s18, 16 -; VI-NEXT: v_writelane_b32 v61, s6, 59 -; VI-NEXT: s_lshr_b32 s6, s18, 8 -; VI-NEXT: v_writelane_b32 v61, s6, 60 -; VI-NEXT: s_lshr_b32 s6, s17, 24 -; VI-NEXT: v_writelane_b32 v61, s6, 61 -; VI-NEXT: s_lshr_b32 s6, s17, 16 -; VI-NEXT: v_writelane_b32 v61, s6, 62 -; VI-NEXT: s_lshr_b32 s6, s17, 8 -; VI-NEXT: v_writelane_b32 v61, s6, 63 -; VI-NEXT: s_lshr_b32 s6, s16, 16 -; VI-NEXT: v_writelane_b32 v62, s6, 0 -; VI-NEXT: s_lshr_b32 s6, s16, 8 -; VI-NEXT: v_writelane_b32 v62, s6, 1 -; VI-NEXT: s_lshr_b32 s6, s39, 24 -; VI-NEXT: v_writelane_b32 v61, s6, 18 -; VI-NEXT: s_lshr_b32 s6, s39, 16 -; VI-NEXT: v_writelane_b32 v61, s6, 19 -; VI-NEXT: s_lshr_b32 s6, s39, 8 -; VI-NEXT: v_writelane_b32 v61, s6, 20 -; VI-NEXT: s_lshr_b32 s6, s38, 16 -; VI-NEXT: v_writelane_b32 v61, s6, 16 -; VI-NEXT: s_lshr_b32 s6, s38, 8 -; VI-NEXT: v_writelane_b32 v61, s6, 17 -; VI-NEXT: s_lshr_b32 s6, s49, 24 -; VI-NEXT: v_writelane_b32 v61, s6, 23 -; VI-NEXT: s_lshr_b32 s6, s49, 16 -; VI-NEXT: v_writelane_b32 v61, s6, 24 -; VI-NEXT: s_lshr_b32 s6, s49, 8 -; VI-NEXT: v_writelane_b32 v61, s6, 25 -; VI-NEXT: s_lshr_b32 s6, s48, 16 -; VI-NEXT: v_writelane_b32 v61, s6, 21 -; VI-NEXT: s_lshr_b32 s6, s48, 8 -; VI-NEXT: v_writelane_b32 v61, s6, 22 +; VI-NEXT: s_lshr_b32 s46, s19, 24 +; VI-NEXT: v_writelane_b32 v22, s46, 10 +; VI-NEXT: s_lshr_b32 s46, s19, 16 +; VI-NEXT: v_writelane_b32 v22, s46, 11 +; VI-NEXT: s_lshr_b32 s46, s19, 8 +; VI-NEXT: v_writelane_b32 v22, s46, 12 +; VI-NEXT: s_lshr_b32 s46, s18, 16 +; VI-NEXT: v_writelane_b32 v22, s46, 13 +; VI-NEXT: s_lshr_b32 s46, s18, 8 +; VI-NEXT: v_writelane_b32 v22, s46, 14 +; VI-NEXT: s_lshr_b32 s46, s21, 24 +; VI-NEXT: v_writelane_b32 v22, s46, 15 +; VI-NEXT: s_lshr_b32 s46, s21, 16 +; VI-NEXT: v_writelane_b32 v22, s46, 16 +; VI-NEXT: s_lshr_b32 s46, s21, 8 +; VI-NEXT: v_writelane_b32 v22, s46, 17 +; VI-NEXT: s_lshr_b32 s46, s20, 16 +; VI-NEXT: v_writelane_b32 v22, s46, 18 +; VI-NEXT: s_lshr_b32 s46, s20, 8 +; VI-NEXT: v_writelane_b32 v22, s46, 19 +; VI-NEXT: s_lshr_b32 s46, s23, 24 +; VI-NEXT: v_writelane_b32 v22, s46, 20 +; VI-NEXT: s_lshr_b32 s46, s23, 16 +; VI-NEXT: v_writelane_b32 v22, s46, 21 +; VI-NEXT: s_lshr_b32 s46, s23, 8 +; VI-NEXT: v_writelane_b32 v22, s46, 22 +; VI-NEXT: s_lshr_b32 s46, s22, 16 +; VI-NEXT: v_writelane_b32 v22, s46, 23 +; VI-NEXT: s_lshr_b32 s46, s22, 8 +; VI-NEXT: v_writelane_b32 v22, s46, 24 +; VI-NEXT: s_lshr_b32 s46, s25, 24 +; VI-NEXT: v_writelane_b32 v22, s46, 25 +; VI-NEXT: s_lshr_b32 s46, s25, 16 +; VI-NEXT: v_writelane_b32 v22, s46, 26 +; VI-NEXT: s_lshr_b32 s46, s25, 8 +; VI-NEXT: v_writelane_b32 v22, s46, 27 +; VI-NEXT: s_lshr_b32 s46, s24, 16 +; VI-NEXT: v_writelane_b32 v22, s46, 28 +; VI-NEXT: s_lshr_b32 s46, s24, 8 +; VI-NEXT: v_writelane_b32 v22, s46, 29 +; VI-NEXT: s_lshr_b32 s46, s27, 24 +; VI-NEXT: v_writelane_b32 v22, s46, 30 +; VI-NEXT: s_lshr_b32 s46, s27, 16 +; VI-NEXT: v_writelane_b32 v22, s46, 31 +; VI-NEXT: s_lshr_b32 s46, s27, 8 +; VI-NEXT: v_writelane_b32 v22, s46, 32 +; VI-NEXT: s_lshr_b32 s46, s26, 16 +; VI-NEXT: v_writelane_b32 v22, s46, 33 +; VI-NEXT: s_lshr_b32 s46, s26, 8 +; VI-NEXT: v_writelane_b32 v22, s46, 34 +; VI-NEXT: s_lshr_b32 s46, s29, 24 +; VI-NEXT: v_writelane_b32 v22, s46, 35 +; VI-NEXT: s_lshr_b32 s46, s29, 16 +; VI-NEXT: v_writelane_b32 v22, s46, 36 +; VI-NEXT: s_lshr_b32 s46, s29, 8 +; VI-NEXT: v_writelane_b32 v22, s46, 37 +; VI-NEXT: s_lshr_b32 s46, s28, 16 +; VI-NEXT: v_writelane_b32 v22, s46, 38 +; VI-NEXT: s_lshr_b32 s46, s28, 8 +; VI-NEXT: v_writelane_b32 v22, s46, 39 +; VI-NEXT: s_lshr_b32 s46, s5, 24 +; VI-NEXT: v_writelane_b32 v22, s46, 40 +; VI-NEXT: s_lshr_b32 s46, s5, 16 +; VI-NEXT: v_writelane_b32 v22, s46, 41 +; VI-NEXT: s_lshr_b32 s46, s5, 8 +; VI-NEXT: v_writelane_b32 v22, s46, 42 +; VI-NEXT: s_lshr_b32 s46, s4, 16 +; VI-NEXT: v_writelane_b32 v22, s46, 43 +; VI-NEXT: s_lshr_b32 s46, s4, 8 +; VI-NEXT: v_writelane_b32 v22, s46, 44 +; VI-NEXT: s_lshr_b32 s46, s7, 24 +; VI-NEXT: v_writelane_b32 v22, s46, 45 +; VI-NEXT: s_lshr_b32 s46, s7, 16 +; VI-NEXT: v_writelane_b32 v22, s46, 46 +; VI-NEXT: s_lshr_b32 s46, s7, 8 +; VI-NEXT: v_writelane_b32 v22, s46, 47 +; VI-NEXT: s_lshr_b32 s46, s6, 16 +; VI-NEXT: v_writelane_b32 v22, s46, 48 +; VI-NEXT: s_lshr_b32 s46, s6, 8 +; VI-NEXT: v_writelane_b32 v22, s46, 49 +; VI-NEXT: s_lshr_b32 s46, s9, 24 +; VI-NEXT: v_writelane_b32 v22, s46, 50 +; VI-NEXT: s_lshr_b32 s46, s9, 16 +; VI-NEXT: v_writelane_b32 v22, s46, 51 +; VI-NEXT: s_lshr_b32 s46, s9, 8 +; VI-NEXT: v_writelane_b32 v22, s46, 52 +; VI-NEXT: s_lshr_b32 s46, s8, 16 +; VI-NEXT: v_writelane_b32 v22, s46, 53 +; VI-NEXT: s_lshr_b32 s46, s8, 8 +; VI-NEXT: v_writelane_b32 v22, s46, 54 +; VI-NEXT: s_lshr_b32 s46, s11, 24 +; VI-NEXT: v_writelane_b32 v22, s46, 55 +; VI-NEXT: s_lshr_b32 s46, s11, 16 +; VI-NEXT: v_writelane_b32 v22, s46, 56 +; VI-NEXT: s_lshr_b32 s46, s11, 8 +; VI-NEXT: v_writelane_b32 v22, s46, 57 +; VI-NEXT: s_lshr_b32 s46, s10, 16 +; VI-NEXT: v_writelane_b32 v22, s46, 58 +; VI-NEXT: s_lshr_b32 s46, s10, 8 +; VI-NEXT: v_writelane_b32 v22, s46, 59 +; VI-NEXT: s_lshr_b32 s46, s13, 24 +; VI-NEXT: v_writelane_b32 v22, s46, 60 +; VI-NEXT: s_lshr_b32 s46, s13, 16 +; VI-NEXT: v_writelane_b32 v22, s46, 61 +; VI-NEXT: s_lshr_b32 s46, s13, 8 +; VI-NEXT: v_writelane_b32 v22, s46, 62 +; VI-NEXT: s_lshr_b32 s46, s12, 16 +; VI-NEXT: v_writelane_b32 v22, s46, 63 +; VI-NEXT: s_lshr_b32 s46, s12, 8 ; VI-NEXT: s_lshr_b64 vcc, s[4:5], 24 -; VI-NEXT: v_writelane_b32 v61, vcc_lo, 14 -; VI-NEXT: v_writelane_b32 v61, vcc_hi, 15 -; VI-NEXT: s_lshr_b64 vcc, s[28:29], 24 -; VI-NEXT: v_writelane_b32 v61, vcc_lo, 12 -; VI-NEXT: v_writelane_b32 v61, vcc_hi, 13 -; VI-NEXT: s_lshr_b64 vcc, s[26:27], 24 -; VI-NEXT: v_writelane_b32 v61, vcc_lo, 10 -; VI-NEXT: v_writelane_b32 v61, vcc_hi, 11 -; VI-NEXT: s_lshr_b64 vcc, s[24:25], 24 -; VI-NEXT: v_writelane_b32 v61, vcc_lo, 8 -; VI-NEXT: v_writelane_b32 v61, vcc_hi, 9 -; VI-NEXT: s_lshr_b64 vcc, s[22:23], 24 -; VI-NEXT: v_writelane_b32 v61, vcc_lo, 6 -; VI-NEXT: v_writelane_b32 v61, vcc_hi, 7 -; VI-NEXT: s_lshr_b64 vcc, s[20:21], 24 -; VI-NEXT: v_writelane_b32 v61, vcc_lo, 4 -; VI-NEXT: v_writelane_b32 v61, vcc_hi, 5 -; VI-NEXT: s_lshr_b64 vcc, s[18:19], 24 -; VI-NEXT: v_writelane_b32 v61, vcc_lo, 2 -; VI-NEXT: v_writelane_b32 v61, vcc_hi, 3 -; VI-NEXT: s_lshr_b64 vcc, s[16:17], 24 -; VI-NEXT: v_writelane_b32 v61, vcc_lo, 0 -; VI-NEXT: s_lshr_b32 s87, s77, 24 -; VI-NEXT: s_lshr_b32 s43, s77, 16 -; VI-NEXT: s_lshr_b32 s42, s77, 8 -; VI-NEXT: s_lshr_b32 s13, s76, 16 -; VI-NEXT: s_lshr_b32 s11, s76, 8 -; VI-NEXT: s_lshr_b32 s86, s89, 24 -; VI-NEXT: s_lshr_b32 s85, s89, 16 -; VI-NEXT: s_lshr_b32 s84, s89, 8 -; VI-NEXT: s_lshr_b32 s9, s88, 16 -; VI-NEXT: s_lshr_b32 s7, s88, 8 -; VI-NEXT: s_lshr_b32 s75, s91, 24 -; VI-NEXT: s_lshr_b32 s74, s91, 16 -; VI-NEXT: s_lshr_b32 s73, s91, 8 -; VI-NEXT: s_lshr_b32 s79, s90, 16 -; VI-NEXT: s_lshr_b32 s78, s90, 8 -; VI-NEXT: s_lshr_b32 s60, s31, 24 -; VI-NEXT: s_lshr_b32 s15, s31, 16 -; VI-NEXT: s_lshr_b32 s14, s31, 8 -; VI-NEXT: s_lshr_b32 s72, s30, 16 -; VI-NEXT: s_lshr_b32 s61, s30, 8 -; VI-NEXT: s_lshr_b32 s63, s35, 24 -; VI-NEXT: s_lshr_b32 s57, s35, 16 -; VI-NEXT: s_lshr_b32 s56, s35, 8 -; VI-NEXT: s_lshr_b32 s83, s34, 16 -; VI-NEXT: s_lshr_b32 s82, s34, 8 -; VI-NEXT: s_lshr_b32 s41, s37, 24 -; VI-NEXT: s_lshr_b32 s47, s37, 16 -; VI-NEXT: s_lshr_b32 s46, s37, 8 -; VI-NEXT: s_lshr_b32 s59, s36, 16 -; VI-NEXT: s_lshr_b32 s45, s36, 8 -; VI-NEXT: v_writelane_b32 v61, vcc_hi, 1 -; VI-NEXT: s_lshr_b64 s[50:51], s[76:77], 24 -; VI-NEXT: s_lshr_b64 s[52:53], s[88:89], 24 -; VI-NEXT: s_lshr_b64 s[54:55], s[90:91], 24 -; VI-NEXT: s_lshr_b64 s[64:65], s[30:31], 24 -; VI-NEXT: s_lshr_b64 s[66:67], s[34:35], 24 -; VI-NEXT: s_lshr_b64 s[68:69], s[36:37], 24 -; VI-NEXT: s_lshr_b64 s[70:71], s[38:39], 24 -; VI-NEXT: s_lshr_b64 s[80:81], s[48:49], 24 -; VI-NEXT: s_mov_b32 s6, s17 -; VI-NEXT: s_mov_b32 s8, s19 -; VI-NEXT: s_mov_b32 s10, s21 -; VI-NEXT: s_mov_b32 s12, s23 -; VI-NEXT: s_mov_b32 s40, s25 -; VI-NEXT: s_mov_b32 s44, s27 -; VI-NEXT: s_mov_b32 s58, s29 -; VI-NEXT: s_mov_b32 s62, s5 -; VI-NEXT: s_cbranch_execnz .LBB91_4 +; VI-NEXT: v_writelane_b32 v21, s46, 0 +; VI-NEXT: s_lshr_b32 s46, s15, 24 +; VI-NEXT: v_writelane_b32 v22, vcc_lo, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 1 +; VI-NEXT: s_lshr_b32 s46, s15, 16 +; VI-NEXT: v_writelane_b32 v22, vcc_hi, 9 +; VI-NEXT: s_lshr_b64 vcc, s[6:7], 24 +; VI-NEXT: v_writelane_b32 v21, s46, 2 +; VI-NEXT: s_lshr_b32 s46, s15, 8 +; VI-NEXT: v_writelane_b32 v22, vcc_lo, 6 +; VI-NEXT: v_writelane_b32 v21, s46, 3 +; VI-NEXT: s_lshr_b32 s46, s14, 16 +; VI-NEXT: v_writelane_b32 v22, vcc_hi, 7 +; VI-NEXT: s_lshr_b64 vcc, s[8:9], 24 +; VI-NEXT: v_writelane_b32 v21, s46, 4 +; VI-NEXT: s_lshr_b32 s46, s14, 8 +; VI-NEXT: v_writelane_b32 v22, vcc_lo, 4 +; VI-NEXT: v_writelane_b32 v21, s46, 5 +; VI-NEXT: s_lshr_b32 s46, s17, 24 +; VI-NEXT: v_writelane_b32 v22, vcc_hi, 5 +; VI-NEXT: s_lshr_b64 vcc, s[10:11], 24 +; VI-NEXT: v_writelane_b32 v21, s46, 6 +; VI-NEXT: s_lshr_b32 s46, s17, 16 +; VI-NEXT: v_writelane_b32 v22, vcc_lo, 2 +; VI-NEXT: v_writelane_b32 v21, s46, 7 +; VI-NEXT: s_lshr_b32 s46, s17, 8 +; VI-NEXT: v_writelane_b32 v22, vcc_hi, 3 +; VI-NEXT: s_lshr_b64 vcc, s[12:13], 24 +; VI-NEXT: v_writelane_b32 v21, s46, 8 +; VI-NEXT: s_lshr_b32 s46, s16, 16 +; VI-NEXT: v_writelane_b32 v22, vcc_lo, 0 +; VI-NEXT: s_lshr_b32 s47, s43, 24 +; VI-NEXT: s_lshr_b32 s57, s43, 16 +; VI-NEXT: s_lshr_b32 s61, s43, 8 +; VI-NEXT: s_lshr_b32 s75, s42, 16 +; VI-NEXT: s_lshr_b32 s79, s42, 8 +; VI-NEXT: s_lshr_b32 s89, s45, 24 +; VI-NEXT: s_lshr_b32 s91, s45, 16 +; VI-NEXT: s_lshr_b32 s31, s45, 8 +; VI-NEXT: s_lshr_b32 s37, s44, 16 +; VI-NEXT: s_lshr_b32 s49, s44, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 9 +; VI-NEXT: s_lshr_b32 s59, s16, 8 +; VI-NEXT: s_lshr_b32 s63, s41, 24 +; VI-NEXT: s_lshr_b32 s73, s41, 16 +; VI-NEXT: s_lshr_b32 s77, s41, 8 +; VI-NEXT: s_lshr_b32 s53, s40, 16 +; VI-NEXT: s_lshr_b32 s65, s40, 8 +; VI-NEXT: s_lshr_b64 s[80:81], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[82:83], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[86:87], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[34:35], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[38:39], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[50:51], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[54:55], s[42:43], 24 +; VI-NEXT: s_lshr_b64 s[66:67], s[44:45], 24 +; VI-NEXT: v_writelane_b32 v22, vcc_hi, 1 +; VI-NEXT: s_lshr_b64 s[68:69], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[70:71], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[84:85], s[40:41], 24 +; VI-NEXT: s_mov_b32 s78, s45 +; VI-NEXT: s_mov_b32 s88, s43 +; VI-NEXT: s_mov_b32 s90, s29 +; VI-NEXT: s_mov_b32 s30, s27 +; VI-NEXT: s_mov_b32 s36, s25 +; VI-NEXT: s_mov_b32 s48, s23 +; VI-NEXT: s_mov_b32 s52, s21 +; VI-NEXT: s_mov_b32 s64, s19 +; VI-NEXT: s_mov_b32 s46, s41 +; VI-NEXT: s_mov_b32 s56, s17 +; VI-NEXT: s_mov_b32 s58, s15 +; VI-NEXT: s_mov_b32 s60, s13 +; VI-NEXT: s_mov_b32 s62, s11 +; VI-NEXT: s_mov_b32 s72, s9 +; VI-NEXT: s_mov_b32 s74, s7 +; VI-NEXT: s_mov_b32 s76, s5 +; VI-NEXT: s_cbranch_execnz .LBB91_3 ; VI-NEXT: .LBB91_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s6, s49, 16 -; VI-NEXT: v_mov_b32_e32 v25, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s6, v25 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s6, s49, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s6, v25 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: s_lshl_b32 s46, s41, 16 +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v2, s46, v1 +; VI-NEXT: v_readfirstlane_b32 s46, v2 +; VI-NEXT: s_bfe_u32 s47, s46, 0x10010 +; VI-NEXT: s_add_i32 s47, s47, s46 +; VI-NEXT: s_add_i32 s56, s47, 0x7fff +; VI-NEXT: s_or_b32 s57, s46, 0x400000 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_lshrrev_b64 v[1:2], 16, v[1:2] -; VI-NEXT: s_lshl_b32 s6, s48, 16 -; VI-NEXT: v_add_f32_e32 v2, s6, v25 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: s_and_b64 s[46:47], vcc, exec +; VI-NEXT: s_cselect_b32 s46, s57, s56 +; VI-NEXT: s_and_b32 s41, s41, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s41, v1 +; VI-NEXT: v_readfirstlane_b32 s41, v2 +; VI-NEXT: s_bfe_u32 s47, s41, 0x10010 +; VI-NEXT: s_add_i32 s47, s47, s41 +; VI-NEXT: s_addk_i32 s47, 0x7fff +; VI-NEXT: s_bitset1_b32 s41, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[56:57], vcc, exec +; VI-NEXT: s_cselect_b32 s41, s41, s47 +; VI-NEXT: s_lshr_b32 s47, s41, 16 +; VI-NEXT: s_lshl_b32 s41, s40, 16 +; VI-NEXT: v_add_f32_e32 v2, s41, v1 +; VI-NEXT: s_lshr_b64 s[46:47], s[46:47], 16 +; VI-NEXT: v_readfirstlane_b32 s41, v2 +; VI-NEXT: s_bfe_u32 s47, s41, 0x10010 +; VI-NEXT: s_add_i32 s47, s47, s41 +; VI-NEXT: s_addk_i32 s47, 0x7fff +; VI-NEXT: s_bitset1_b32 s41, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[56:57], vcc, exec +; VI-NEXT: s_cselect_b32 s56, s41, s47 +; VI-NEXT: s_and_b32 s40, s40, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s40, v1 +; VI-NEXT: v_readfirstlane_b32 s40, v2 +; VI-NEXT: s_bfe_u32 s41, s40, 0x10010 +; VI-NEXT: s_add_i32 s41, s41, s40 +; VI-NEXT: s_add_i32 s47, s41, 0x7fff +; VI-NEXT: s_or_b32 s57, s40, 0x400000 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s6, s48, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s6, v25 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_lshl_b32 s6, s39, 16 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s6, v25 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: s_and_b32 s6, s39, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_add_f32_e32 v5, s6, v25 -; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[4:5] -; VI-NEXT: s_lshl_b32 s6, s38, 16 -; VI-NEXT: v_add_f32_e32 v5, s6, v25 -; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: s_and_b32 s6, s38, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; VI-NEXT: v_add_f32_e32 v6, s6, v25 -; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v8, 0x400000, v6 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: s_lshl_b32 s6, s37, 16 -; VI-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc -; VI-NEXT: v_add_f32_e32 v7, s6, v25 -; VI-NEXT: v_bfe_u32 v8, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: s_and_b32 s6, s37, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc -; VI-NEXT: v_add_f32_e32 v8, s6, v25 -; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_lshrrev_b64 v[7:8], 16, v[7:8] -; VI-NEXT: s_lshl_b32 s6, s36, 16 -; VI-NEXT: v_add_f32_e32 v8, s6, v25 -; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: s_and_b32 s6, s36, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc -; VI-NEXT: v_add_f32_e32 v9, s6, v25 -; VI-NEXT: v_bfe_u32 v10, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v9 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: s_lshl_b32 s6, s35, 16 -; VI-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc -; VI-NEXT: v_add_f32_e32 v10, s6, v25 -; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 -; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 -; VI-NEXT: v_or_b32_e32 v12, 0x400000, v10 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; VI-NEXT: s_and_b32 s6, s35, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc -; VI-NEXT: v_add_f32_e32 v11, s6, v25 -; VI-NEXT: v_bfe_u32 v12, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[10:11] -; VI-NEXT: s_lshl_b32 s6, s34, 16 -; VI-NEXT: v_add_f32_e32 v11, s6, v25 -; VI-NEXT: v_bfe_u32 v12, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: s_and_b32 s6, s34, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc -; VI-NEXT: v_add_f32_e32 v12, s6, v25 -; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; VI-NEXT: s_lshl_b32 s6, s31, 16 -; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc -; VI-NEXT: v_add_f32_e32 v13, s6, v25 -; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: s_and_b32 s6, s31, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc -; VI-NEXT: v_add_f32_e32 v14, s6, v25 -; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 -; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v14 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; VI-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_lshrrev_b64 v[13:14], 16, v[13:14] -; VI-NEXT: s_lshl_b32 s6, s30, 16 -; VI-NEXT: v_add_f32_e32 v14, s6, v25 -; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 -; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v14 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; VI-NEXT: s_and_b32 s6, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc -; VI-NEXT: v_add_f32_e32 v15, s6, v25 -; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: s_lshl_b32 s6, s91, 16 -; VI-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; VI-NEXT: v_add_f32_e32 v16, s6, v25 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: s_and_b32 s6, s91, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_add_f32_e32 v17, s6, v25 -; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[16:17] -; VI-NEXT: s_lshl_b32 s6, s90, 16 -; VI-NEXT: v_add_f32_e32 v17, s6, v25 -; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: s_and_b32 s6, s90, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; VI-NEXT: v_add_f32_e32 v18, s6, v25 -; VI-NEXT: v_bfe_u32 v19, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v18 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19 -; VI-NEXT: v_or_b32_e32 v20, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: s_lshl_b32 s6, s89, 16 -; VI-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc -; VI-NEXT: v_add_f32_e32 v19, s6, v25 -; VI-NEXT: v_bfe_u32 v20, v19, 16, 1 -; VI-NEXT: v_add_u32_e32 v20, vcc, v20, v19 -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x7fff, v20 -; VI-NEXT: v_or_b32_e32 v21, 0x400000, v19 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: s_and_b32 s6, s89, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc -; VI-NEXT: v_add_f32_e32 v20, s6, v25 -; VI-NEXT: v_bfe_u32 v21, v20, 16, 1 -; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v20 -; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 -; VI-NEXT: v_or_b32_e32 v22, 0x400000, v20 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; VI-NEXT: v_cndmask_b32_e32 v20, v21, v22, vcc -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_lshrrev_b64 v[19:20], 16, v[19:20] -; VI-NEXT: s_lshl_b32 s6, s88, 16 -; VI-NEXT: v_add_f32_e32 v20, s6, v25 -; VI-NEXT: v_bfe_u32 v21, v20, 16, 1 -; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v20 -; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 -; VI-NEXT: v_or_b32_e32 v22, 0x400000, v20 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; VI-NEXT: s_and_b32 s6, s88, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v20, v21, v22, vcc -; VI-NEXT: v_add_f32_e32 v21, s6, v25 -; VI-NEXT: v_bfe_u32 v22, v21, 16, 1 -; VI-NEXT: v_add_u32_e32 v22, vcc, v22, v21 -; VI-NEXT: v_add_u32_e32 v22, vcc, 0x7fff, v22 -; VI-NEXT: v_or_b32_e32 v23, 0x400000, v21 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: s_lshl_b32 s6, s77, 16 -; VI-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc -; VI-NEXT: v_add_f32_e32 v22, s6, v25 -; VI-NEXT: v_bfe_u32 v23, v22, 16, 1 -; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v22 -; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 -; VI-NEXT: v_or_b32_e32 v24, 0x400000, v22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; VI-NEXT: s_and_b32 s6, s77, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v22, v23, v24, vcc -; VI-NEXT: v_add_f32_e32 v23, s6, v25 -; VI-NEXT: v_bfe_u32 v24, v23, 16, 1 -; VI-NEXT: v_add_u32_e32 v24, vcc, v24, v23 -; VI-NEXT: v_add_u32_e32 v24, vcc, 0x7fff, v24 -; VI-NEXT: v_or_b32_e32 v26, 0x400000, v23 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: v_cndmask_b32_e32 v23, v24, v26, vcc -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[22:23] -; VI-NEXT: s_lshl_b32 s6, s76, 16 -; VI-NEXT: v_add_f32_e32 v23, s6, v25 -; VI-NEXT: v_bfe_u32 v24, v23, 16, 1 -; VI-NEXT: v_add_u32_e32 v24, vcc, v24, v23 -; VI-NEXT: v_add_u32_e32 v24, vcc, 0x7fff, v24 -; VI-NEXT: v_or_b32_e32 v26, 0x400000, v23 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: s_and_b32 s6, s76, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v23, v24, v26, vcc -; VI-NEXT: v_add_f32_e32 v24, s6, v25 -; VI-NEXT: v_bfe_u32 v26, v24, 16, 1 -; VI-NEXT: v_add_u32_e32 v26, vcc, v26, v24 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x7fff, v26 -; VI-NEXT: v_or_b32_e32 v27, 0x400000, v24 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; VI-NEXT: s_lshl_b32 s6, s17, 16 -; VI-NEXT: v_cndmask_b32_e32 v24, v26, v27, vcc -; VI-NEXT: v_add_f32_e32 v26, s6, v25 -; VI-NEXT: v_readfirstlane_b32 s6, v26 -; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 -; VI-NEXT: s_add_i32 s7, s7, s6 -; VI-NEXT: s_add_i32 s8, s7, 0x7fff -; VI-NEXT: s_or_b32 s9, s6, 0x400000 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: s_and_b64 s[6:7], vcc, exec -; VI-NEXT: s_cselect_b32 s6, s9, s8 -; VI-NEXT: s_and_b32 s7, s17, 0xffff0000 -; VI-NEXT: v_add_f32_e32 v26, s7, v25 -; VI-NEXT: v_readfirstlane_b32 s7, v26 -; VI-NEXT: s_bfe_u32 s8, s7, 0x10010 -; VI-NEXT: s_add_i32 s8, s8, s7 -; VI-NEXT: s_add_i32 s10, s8, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: s_and_b64 s[8:9], vcc, exec -; VI-NEXT: s_cselect_b32 s7, s7, s10 -; VI-NEXT: s_lshr_b32 s7, s7, 16 -; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], 16 -; VI-NEXT: s_lshl_b32 s7, s16, 16 -; VI-NEXT: v_add_f32_e32 v26, s7, v25 -; VI-NEXT: v_readfirstlane_b32 s7, v26 -; VI-NEXT: s_bfe_u32 s8, s7, 0x10010 -; VI-NEXT: s_add_i32 s8, s8, s7 -; VI-NEXT: s_add_i32 s10, s8, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: s_and_b64 s[8:9], vcc, exec -; VI-NEXT: s_cselect_b32 s8, s7, s10 -; VI-NEXT: s_and_b32 s7, s16, 0xffff0000 -; VI-NEXT: v_add_f32_e32 v26, s7, v25 -; VI-NEXT: v_readfirstlane_b32 s7, v26 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: s_and_b64 s[10:11], vcc, exec -; VI-NEXT: s_cselect_b32 s7, s7, s9 -; VI-NEXT: s_lshr_b32 s9, s7, 16 -; VI-NEXT: s_lshl_b32 s7, s19, 16 -; VI-NEXT: v_add_f32_e32 v26, s7, v25 -; VI-NEXT: v_readfirstlane_b32 s7, v26 -; VI-NEXT: s_lshr_b64 s[16:17], s[8:9], 16 -; VI-NEXT: s_bfe_u32 s8, s7, 0x10010 -; VI-NEXT: s_add_i32 s8, s8, s7 -; VI-NEXT: s_add_i32 s10, s8, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: s_and_b64 s[8:9], vcc, exec -; VI-NEXT: s_cselect_b32 s8, s7, s10 -; VI-NEXT: s_and_b32 s7, s19, 0xffff0000 -; VI-NEXT: v_add_f32_e32 v26, s7, v25 -; VI-NEXT: v_readfirstlane_b32 s7, v26 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: s_and_b64 s[10:11], vcc, exec -; VI-NEXT: s_cselect_b32 s7, s7, s9 -; VI-NEXT: s_lshr_b32 s9, s7, 16 -; VI-NEXT: s_lshl_b32 s7, s18, 16 -; VI-NEXT: v_add_f32_e32 v26, s7, v25 -; VI-NEXT: s_lshr_b64 s[8:9], s[8:9], 16 -; VI-NEXT: v_readfirstlane_b32 s7, v26 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: s_and_b64 s[10:11], vcc, exec -; VI-NEXT: s_cselect_b32 s10, s7, s9 -; VI-NEXT: s_and_b32 s7, s18, 0xffff0000 -; VI-NEXT: v_add_f32_e32 v26, s7, v25 -; VI-NEXT: v_readfirstlane_b32 s7, v26 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: s_and_b64 s[12:13], vcc, exec -; VI-NEXT: s_cselect_b32 s7, s7, s9 -; VI-NEXT: s_lshr_b32 s11, s7, 16 -; VI-NEXT: s_lshl_b32 s7, s21, 16 -; VI-NEXT: v_add_f32_e32 v26, s7, v25 -; VI-NEXT: v_readfirstlane_b32 s7, v26 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_lshr_b64 s[18:19], s[10:11], 16 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: s_and_b64 s[10:11], vcc, exec -; VI-NEXT: s_cselect_b32 s10, s7, s9 -; VI-NEXT: s_and_b32 s7, s21, 0xffff0000 -; VI-NEXT: v_add_f32_e32 v26, s7, v25 -; VI-NEXT: v_readfirstlane_b32 s7, v26 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: s_and_b64 s[12:13], vcc, exec -; VI-NEXT: s_cselect_b32 s7, s7, s9 -; VI-NEXT: s_lshr_b32 s11, s7, 16 -; VI-NEXT: s_lshl_b32 s7, s20, 16 -; VI-NEXT: v_add_f32_e32 v26, s7, v25 -; VI-NEXT: v_readfirstlane_b32 s7, v26 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_lshr_b64 s[10:11], s[10:11], 16 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: s_and_b64 s[12:13], vcc, exec -; VI-NEXT: s_cselect_b32 s12, s7, s9 -; VI-NEXT: s_and_b32 s7, s20, 0xffff0000 -; VI-NEXT: v_add_f32_e32 v26, s7, v25 -; VI-NEXT: v_readfirstlane_b32 s7, v26 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: s_and_b64 s[14:15], vcc, exec -; VI-NEXT: s_cselect_b32 s7, s7, s9 -; VI-NEXT: s_lshr_b32 s13, s7, 16 -; VI-NEXT: s_lshl_b32 s7, s23, 16 -; VI-NEXT: v_add_f32_e32 v26, s7, v25 -; VI-NEXT: v_readfirstlane_b32 s7, v26 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_lshr_b64 s[20:21], s[12:13], 16 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: s_and_b64 s[12:13], vcc, exec -; VI-NEXT: s_cselect_b32 s12, s7, s9 -; VI-NEXT: s_and_b32 s7, s23, 0xffff0000 -; VI-NEXT: v_add_f32_e32 v26, s7, v25 -; VI-NEXT: v_readfirstlane_b32 s7, v26 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: s_and_b64 s[14:15], vcc, exec -; VI-NEXT: s_cselect_b32 s7, s7, s9 -; VI-NEXT: s_lshr_b32 s13, s7, 16 -; VI-NEXT: s_lshl_b32 s7, s22, 16 -; VI-NEXT: v_add_f32_e32 v26, s7, v25 -; VI-NEXT: v_readfirstlane_b32 s7, v26 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_lshr_b64 s[12:13], s[12:13], 16 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: s_and_b64 s[14:15], vcc, exec -; VI-NEXT: s_cselect_b32 s14, s7, s9 -; VI-NEXT: s_and_b32 s7, s22, 0xffff0000 -; VI-NEXT: v_add_f32_e32 v26, s7, v25 -; VI-NEXT: v_readfirstlane_b32 s7, v26 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: s_and_b64 s[22:23], vcc, exec -; VI-NEXT: s_cselect_b32 s7, s7, s9 -; VI-NEXT: s_lshr_b32 s15, s7, 16 -; VI-NEXT: s_lshl_b32 s7, s25, 16 -; VI-NEXT: v_add_f32_e32 v26, s7, v25 -; VI-NEXT: v_readfirstlane_b32 s7, v26 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_lshr_b64 s[22:23], s[14:15], 16 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: s_and_b64 s[14:15], vcc, exec -; VI-NEXT: s_cselect_b32 s14, s7, s9 -; VI-NEXT: s_and_b32 s7, s25, 0xffff0000 -; VI-NEXT: v_add_f32_e32 v26, s7, v25 -; VI-NEXT: v_readfirstlane_b32 s7, v26 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 ; VI-NEXT: s_and_b64 s[40:41], vcc, exec -; VI-NEXT: s_cselect_b32 s7, s7, s9 -; VI-NEXT: s_lshr_b32 s15, s7, 16 -; VI-NEXT: s_lshl_b32 s7, s24, 16 -; VI-NEXT: v_add_f32_e32 v26, s7, v25 -; VI-NEXT: v_readfirstlane_b32 s7, v26 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_lshr_b64 s[40:41], s[14:15], 16 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: s_and_b64 s[14:15], vcc, exec -; VI-NEXT: s_cselect_b32 s14, s7, s9 -; VI-NEXT: s_and_b32 s7, s24, 0xffff0000 -; VI-NEXT: v_add_f32_e32 v26, s7, v25 -; VI-NEXT: v_readfirstlane_b32 s7, v26 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: s_and_b64 s[24:25], vcc, exec -; VI-NEXT: s_cselect_b32 s7, s7, s9 -; VI-NEXT: s_lshr_b32 s15, s7, 16 -; VI-NEXT: s_lshl_b32 s7, s27, 16 -; VI-NEXT: v_add_f32_e32 v26, s7, v25 -; VI-NEXT: v_readfirstlane_b32 s7, v26 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_lshr_b64 s[24:25], s[14:15], 16 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: s_and_b64 s[14:15], vcc, exec -; VI-NEXT: s_cselect_b32 s14, s7, s9 -; VI-NEXT: s_and_b32 s7, s27, 0xffff0000 -; VI-NEXT: v_add_f32_e32 v26, s7, v25 -; VI-NEXT: v_readfirstlane_b32 s7, v26 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: s_and_b64 s[42:43], vcc, exec -; VI-NEXT: s_cselect_b32 s7, s7, s9 -; VI-NEXT: s_lshr_b32 s15, s7, 16 -; VI-NEXT: s_lshl_b32 s7, s26, 16 -; VI-NEXT: v_add_f32_e32 v26, s7, v25 -; VI-NEXT: v_readfirstlane_b32 s7, v26 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_lshr_b64 s[44:45], s[14:15], 16 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: s_and_b64 s[14:15], vcc, exec -; VI-NEXT: s_cselect_b32 s14, s7, s9 -; VI-NEXT: s_and_b32 s7, s26, 0xffff0000 -; VI-NEXT: v_add_f32_e32 v26, s7, v25 -; VI-NEXT: v_readfirstlane_b32 s7, v26 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: s_and_b64 s[26:27], vcc, exec -; VI-NEXT: s_cselect_b32 s7, s7, s9 -; VI-NEXT: s_lshr_b32 s15, s7, 16 -; VI-NEXT: s_lshl_b32 s7, s29, 16 -; VI-NEXT: v_add_f32_e32 v26, s7, v25 -; VI-NEXT: v_readfirstlane_b32 s7, v26 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_lshr_b64 s[26:27], s[14:15], 16 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: s_and_b64 s[14:15], vcc, exec -; VI-NEXT: s_cselect_b32 s14, s7, s9 -; VI-NEXT: s_and_b32 s7, s29, 0xffff0000 -; VI-NEXT: v_add_f32_e32 v26, s7, v25 -; VI-NEXT: v_readfirstlane_b32 s7, v26 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: s_and_b64 s[42:43], vcc, exec -; VI-NEXT: s_cselect_b32 s7, s7, s9 -; VI-NEXT: s_lshr_b32 s15, s7, 16 -; VI-NEXT: s_lshl_b32 s7, s28, 16 -; VI-NEXT: v_add_f32_e32 v26, s7, v25 -; VI-NEXT: v_readfirstlane_b32 s7, v26 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_lshr_b64 s[58:59], s[14:15], 16 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_cselect_b32 s40, s57, s47 +; VI-NEXT: s_lshl_b32 s47, s17, 16 +; VI-NEXT: v_add_f32_e32 v2, s47, v1 +; VI-NEXT: s_lshr_b32 s57, s40, 16 +; VI-NEXT: v_readfirstlane_b32 s47, v2 +; VI-NEXT: s_lshr_b64 s[40:41], s[56:57], 16 +; VI-NEXT: s_bfe_u32 s56, s47, 0x10010 +; VI-NEXT: s_add_i32 s56, s56, s47 +; VI-NEXT: s_add_i32 s58, s56, 0x7fff +; VI-NEXT: s_bitset1_b32 s47, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[56:57], vcc, exec +; VI-NEXT: s_cselect_b32 s56, s47, s58 +; VI-NEXT: s_and_b32 s17, s17, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s17, v1 +; VI-NEXT: v_readfirstlane_b32 s17, v2 +; VI-NEXT: s_bfe_u32 s47, s17, 0x10010 +; VI-NEXT: s_add_i32 s47, s47, s17 +; VI-NEXT: s_addk_i32 s47, 0x7fff +; VI-NEXT: s_bitset1_b32 s17, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[58:59], vcc, exec +; VI-NEXT: s_cselect_b32 s17, s17, s47 +; VI-NEXT: s_lshr_b32 s57, s17, 16 +; VI-NEXT: s_lshl_b32 s17, s16, 16 +; VI-NEXT: v_add_f32_e32 v2, s17, v1 +; VI-NEXT: v_readfirstlane_b32 s17, v2 +; VI-NEXT: s_bfe_u32 s47, s17, 0x10010 +; VI-NEXT: s_add_i32 s47, s47, s17 +; VI-NEXT: s_lshr_b64 s[56:57], s[56:57], 16 +; VI-NEXT: s_addk_i32 s47, 0x7fff +; VI-NEXT: s_bitset1_b32 s17, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[58:59], vcc, exec +; VI-NEXT: s_cselect_b32 s58, s17, s47 +; VI-NEXT: s_and_b32 s16, s16, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s16, v1 +; VI-NEXT: v_readfirstlane_b32 s16, v2 +; VI-NEXT: s_bfe_u32 s17, s16, 0x10010 +; VI-NEXT: s_add_i32 s17, s17, s16 +; VI-NEXT: s_add_i32 s47, s17, 0x7fff +; VI-NEXT: s_or_b32 s57, s16, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[16:17], vcc, exec +; VI-NEXT: s_cselect_b32 s16, s57, s47 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: v_add_f32_e32 v2, s47, v1 +; VI-NEXT: v_readfirstlane_b32 s47, v2 +; VI-NEXT: s_bfe_u32 s57, s47, 0x10010 +; VI-NEXT: s_lshr_b32 s59, s16, 16 +; VI-NEXT: s_add_i32 s57, s57, s47 +; VI-NEXT: s_lshr_b64 s[16:17], s[58:59], 16 +; VI-NEXT: s_addk_i32 s57, 0x7fff +; VI-NEXT: s_bitset1_b32 s47, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[58:59], vcc, exec +; VI-NEXT: s_cselect_b32 s58, s47, s57 +; VI-NEXT: s_and_b32 s15, s15, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s15, v1 +; VI-NEXT: v_readfirstlane_b32 s15, v2 +; VI-NEXT: s_bfe_u32 s47, s15, 0x10010 +; VI-NEXT: s_add_i32 s47, s47, s15 +; VI-NEXT: s_addk_i32 s47, 0x7fff +; VI-NEXT: s_bitset1_b32 s15, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[60:61], vcc, exec +; VI-NEXT: s_cselect_b32 s15, s15, s47 +; VI-NEXT: s_lshr_b32 s59, s15, 16 +; VI-NEXT: s_lshl_b32 s15, s14, 16 +; VI-NEXT: v_add_f32_e32 v2, s15, v1 +; VI-NEXT: v_readfirstlane_b32 s15, v2 +; VI-NEXT: s_bfe_u32 s47, s15, 0x10010 +; VI-NEXT: s_add_i32 s47, s47, s15 +; VI-NEXT: s_lshr_b64 s[58:59], s[58:59], 16 +; VI-NEXT: s_addk_i32 s47, 0x7fff +; VI-NEXT: s_bitset1_b32 s15, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[60:61], vcc, exec +; VI-NEXT: s_cselect_b32 s60, s15, s47 +; VI-NEXT: s_and_b32 s14, s14, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s14, v1 +; VI-NEXT: v_readfirstlane_b32 s14, v2 +; VI-NEXT: s_bfe_u32 s15, s14, 0x10010 +; VI-NEXT: s_add_i32 s15, s15, s14 +; VI-NEXT: s_add_i32 s47, s15, 0x7fff +; VI-NEXT: s_or_b32 s57, s14, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: s_and_b64 s[14:15], vcc, exec -; VI-NEXT: s_cselect_b32 s14, s7, s9 -; VI-NEXT: s_and_b32 s7, s28, 0xffff0000 -; VI-NEXT: v_add_f32_e32 v26, s7, v25 -; VI-NEXT: v_readfirstlane_b32 s7, v26 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_cselect_b32 s14, s57, s47 +; VI-NEXT: s_lshl_b32 s47, s13, 16 +; VI-NEXT: v_add_f32_e32 v2, s47, v1 +; VI-NEXT: v_readfirstlane_b32 s47, v2 +; VI-NEXT: s_bfe_u32 s57, s47, 0x10010 +; VI-NEXT: s_lshr_b32 s61, s14, 16 +; VI-NEXT: s_add_i32 s57, s57, s47 +; VI-NEXT: s_lshr_b64 s[14:15], s[60:61], 16 +; VI-NEXT: s_addk_i32 s57, 0x7fff +; VI-NEXT: s_bitset1_b32 s47, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[60:61], vcc, exec +; VI-NEXT: s_cselect_b32 s60, s47, s57 +; VI-NEXT: s_and_b32 s13, s13, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s13, v1 +; VI-NEXT: v_readfirstlane_b32 s13, v2 +; VI-NEXT: s_bfe_u32 s47, s13, 0x10010 +; VI-NEXT: s_add_i32 s47, s47, s13 +; VI-NEXT: s_addk_i32 s47, 0x7fff +; VI-NEXT: s_bitset1_b32 s13, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[62:63], vcc, exec +; VI-NEXT: s_cselect_b32 s13, s13, s47 +; VI-NEXT: s_lshr_b32 s61, s13, 16 +; VI-NEXT: s_lshl_b32 s13, s12, 16 +; VI-NEXT: v_add_f32_e32 v2, s13, v1 +; VI-NEXT: v_readfirstlane_b32 s13, v2 +; VI-NEXT: s_bfe_u32 s47, s13, 0x10010 +; VI-NEXT: s_add_i32 s47, s47, s13 +; VI-NEXT: s_lshr_b64 s[60:61], s[60:61], 16 +; VI-NEXT: s_addk_i32 s47, 0x7fff +; VI-NEXT: s_bitset1_b32 s13, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[62:63], vcc, exec +; VI-NEXT: s_cselect_b32 s62, s13, s47 +; VI-NEXT: s_and_b32 s12, s12, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s12, v1 +; VI-NEXT: v_readfirstlane_b32 s12, v2 +; VI-NEXT: s_bfe_u32 s13, s12, 0x10010 +; VI-NEXT: s_add_i32 s13, s13, s12 +; VI-NEXT: s_add_i32 s47, s13, 0x7fff +; VI-NEXT: s_or_b32 s57, s12, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[12:13], vcc, exec +; VI-NEXT: s_cselect_b32 s12, s57, s47 +; VI-NEXT: s_lshl_b32 s47, s11, 16 +; VI-NEXT: v_add_f32_e32 v2, s47, v1 +; VI-NEXT: v_readfirstlane_b32 s47, v2 +; VI-NEXT: s_bfe_u32 s57, s47, 0x10010 +; VI-NEXT: s_lshr_b32 s63, s12, 16 +; VI-NEXT: s_add_i32 s57, s57, s47 +; VI-NEXT: s_lshr_b64 s[12:13], s[62:63], 16 +; VI-NEXT: s_addk_i32 s57, 0x7fff +; VI-NEXT: s_bitset1_b32 s47, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[62:63], vcc, exec +; VI-NEXT: s_cselect_b32 s62, s47, s57 +; VI-NEXT: s_and_b32 s11, s11, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s11, v1 +; VI-NEXT: v_readfirstlane_b32 s11, v2 +; VI-NEXT: s_bfe_u32 s47, s11, 0x10010 +; VI-NEXT: s_add_i32 s47, s47, s11 +; VI-NEXT: s_addk_i32 s47, 0x7fff +; VI-NEXT: s_bitset1_b32 s11, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[72:73], vcc, exec +; VI-NEXT: s_cselect_b32 s11, s11, s47 +; VI-NEXT: s_lshr_b32 s63, s11, 16 +; VI-NEXT: s_lshl_b32 s11, s10, 16 +; VI-NEXT: v_add_f32_e32 v2, s11, v1 +; VI-NEXT: v_readfirstlane_b32 s11, v2 +; VI-NEXT: s_bfe_u32 s47, s11, 0x10010 +; VI-NEXT: s_add_i32 s47, s47, s11 +; VI-NEXT: s_lshr_b64 s[62:63], s[62:63], 16 +; VI-NEXT: s_addk_i32 s47, 0x7fff +; VI-NEXT: s_bitset1_b32 s11, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[72:73], vcc, exec +; VI-NEXT: s_cselect_b32 s72, s11, s47 +; VI-NEXT: s_and_b32 s10, s10, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s10, v1 +; VI-NEXT: v_readfirstlane_b32 s10, v2 +; VI-NEXT: s_bfe_u32 s11, s10, 0x10010 +; VI-NEXT: s_add_i32 s11, s11, s10 +; VI-NEXT: s_add_i32 s47, s11, 0x7fff +; VI-NEXT: s_or_b32 s57, s10, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[10:11], vcc, exec +; VI-NEXT: s_cselect_b32 s10, s57, s47 +; VI-NEXT: s_lshl_b32 s47, s9, 16 +; VI-NEXT: v_add_f32_e32 v2, s47, v1 +; VI-NEXT: v_readfirstlane_b32 s47, v2 +; VI-NEXT: s_bfe_u32 s57, s47, 0x10010 +; VI-NEXT: s_lshr_b32 s73, s10, 16 +; VI-NEXT: s_add_i32 s57, s57, s47 +; VI-NEXT: s_lshr_b64 s[10:11], s[72:73], 16 +; VI-NEXT: s_addk_i32 s57, 0x7fff +; VI-NEXT: s_bitset1_b32 s47, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[72:73], vcc, exec +; VI-NEXT: s_cselect_b32 s72, s47, s57 +; VI-NEXT: s_and_b32 s9, s9, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s9, v1 +; VI-NEXT: v_readfirstlane_b32 s9, v2 +; VI-NEXT: s_bfe_u32 s47, s9, 0x10010 +; VI-NEXT: s_add_i32 s47, s47, s9 +; VI-NEXT: s_addk_i32 s47, 0x7fff +; VI-NEXT: s_bitset1_b32 s9, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[74:75], vcc, exec +; VI-NEXT: s_cselect_b32 s9, s9, s47 +; VI-NEXT: s_lshr_b32 s73, s9, 16 +; VI-NEXT: s_lshl_b32 s9, s8, 16 +; VI-NEXT: v_add_f32_e32 v2, s9, v1 +; VI-NEXT: v_readfirstlane_b32 s9, v2 +; VI-NEXT: s_bfe_u32 s47, s9, 0x10010 +; VI-NEXT: s_add_i32 s47, s47, s9 +; VI-NEXT: s_lshr_b64 s[72:73], s[72:73], 16 +; VI-NEXT: s_addk_i32 s47, 0x7fff +; VI-NEXT: s_bitset1_b32 s9, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[74:75], vcc, exec +; VI-NEXT: s_cselect_b32 s74, s9, s47 +; VI-NEXT: s_and_b32 s8, s8, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s8, v1 +; VI-NEXT: v_readfirstlane_b32 s8, v2 +; VI-NEXT: s_bfe_u32 s9, s8, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s8 +; VI-NEXT: s_add_i32 s47, s9, 0x7fff +; VI-NEXT: s_or_b32 s57, s8, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[8:9], vcc, exec +; VI-NEXT: s_cselect_b32 s8, s57, s47 +; VI-NEXT: s_lshl_b32 s47, s7, 16 +; VI-NEXT: v_add_f32_e32 v2, s47, v1 +; VI-NEXT: v_readfirstlane_b32 s47, v2 +; VI-NEXT: s_bfe_u32 s57, s47, 0x10010 +; VI-NEXT: s_lshr_b32 s75, s8, 16 +; VI-NEXT: s_add_i32 s57, s57, s47 +; VI-NEXT: s_lshr_b64 s[8:9], s[74:75], 16 +; VI-NEXT: s_addk_i32 s57, 0x7fff +; VI-NEXT: s_bitset1_b32 s47, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[74:75], vcc, exec +; VI-NEXT: s_cselect_b32 s74, s47, s57 +; VI-NEXT: s_and_b32 s7, s7, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s47, s7, 0x10010 +; VI-NEXT: s_add_i32 s47, s47, s7 +; VI-NEXT: s_addk_i32 s47, 0x7fff ; VI-NEXT: s_bitset1_b32 s7, 22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: s_and_b64 s[28:29], vcc, exec -; VI-NEXT: s_cselect_b32 s7, s7, s9 -; VI-NEXT: s_lshr_b32 s15, s7, 16 -; VI-NEXT: s_lshl_b32 s7, s5, 16 -; VI-NEXT: v_add_f32_e32 v26, s7, v25 -; VI-NEXT: v_readfirstlane_b32 s7, v26 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_lshr_b64 s[28:29], s[14:15], 16 -; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[76:77], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s47 +; VI-NEXT: s_lshr_b32 s75, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s6, 16 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s47, s7, 0x10010 +; VI-NEXT: s_add_i32 s47, s47, s7 +; VI-NEXT: s_lshr_b64 s[74:75], s[74:75], 16 +; VI-NEXT: s_addk_i32 s47, 0x7fff ; VI-NEXT: s_bitset1_b32 s7, 22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: s_and_b64 s[14:15], vcc, exec -; VI-NEXT: s_cselect_b32 s14, s7, s9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[76:77], vcc, exec +; VI-NEXT: s_cselect_b32 s76, s7, s47 +; VI-NEXT: s_and_b32 s6, s6, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_add_i32 s47, s7, 0x7fff +; VI-NEXT: s_or_b32 s57, s6, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s6, s57, s47 +; VI-NEXT: s_lshl_b32 s47, s5, 16 +; VI-NEXT: v_add_f32_e32 v2, s47, v1 +; VI-NEXT: v_readfirstlane_b32 s47, v2 +; VI-NEXT: s_bfe_u32 s57, s47, 0x10010 +; VI-NEXT: s_lshr_b32 s77, s6, 16 +; VI-NEXT: s_add_i32 s57, s57, s47 +; VI-NEXT: s_lshr_b64 s[6:7], s[76:77], 16 +; VI-NEXT: s_addk_i32 s57, 0x7fff +; VI-NEXT: s_bitset1_b32 s47, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[76:77], vcc, exec +; VI-NEXT: s_cselect_b32 s76, s47, s57 ; VI-NEXT: s_and_b32 s5, s5, 0xffff0000 -; VI-NEXT: v_add_f32_e32 v26, s5, v25 -; VI-NEXT: v_readfirstlane_b32 s5, v26 -; VI-NEXT: s_bfe_u32 s7, s5, 0x10010 -; VI-NEXT: s_add_i32 s7, s7, s5 -; VI-NEXT: s_addk_i32 s7, 0x7fff +; VI-NEXT: v_add_f32_e32 v2, s5, v1 +; VI-NEXT: v_readfirstlane_b32 s5, v2 +; VI-NEXT: s_bfe_u32 s47, s5, 0x10010 +; VI-NEXT: s_add_i32 s47, s47, s5 +; VI-NEXT: s_addk_i32 s47, 0x7fff ; VI-NEXT: s_bitset1_b32 s5, 22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: s_and_b64 s[42:43], vcc, exec -; VI-NEXT: s_cselect_b32 s5, s5, s7 -; VI-NEXT: s_lshr_b32 s15, s5, 16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[78:79], vcc, exec +; VI-NEXT: s_cselect_b32 s5, s5, s47 +; VI-NEXT: s_lshr_b32 s77, s5, 16 ; VI-NEXT: s_lshl_b32 s5, s4, 16 -; VI-NEXT: v_add_f32_e32 v26, s5, v25 -; VI-NEXT: v_readfirstlane_b32 s5, v26 -; VI-NEXT: s_bfe_u32 s7, s5, 0x10010 -; VI-NEXT: s_add_i32 s7, s7, s5 -; VI-NEXT: s_lshr_b64 s[62:63], s[14:15], 16 -; VI-NEXT: s_addk_i32 s7, 0x7fff +; VI-NEXT: v_add_f32_e32 v2, s5, v1 +; VI-NEXT: v_readfirstlane_b32 s5, v2 +; VI-NEXT: s_bfe_u32 s47, s5, 0x10010 +; VI-NEXT: s_add_i32 s47, s47, s5 +; VI-NEXT: s_lshr_b64 s[76:77], s[76:77], 16 +; VI-NEXT: s_addk_i32 s47, 0x7fff ; VI-NEXT: s_bitset1_b32 s5, 22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; VI-NEXT: s_and_b64 s[14:15], vcc, exec -; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[23:24] -; VI-NEXT: s_cselect_b32 s14, s5, s7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[78:79], vcc, exec +; VI-NEXT: s_cselect_b32 s78, s5, s47 ; VI-NEXT: s_and_b32 s4, s4, 0xffff0000 -; VI-NEXT: v_mov_b32_e32 v24, v22 -; VI-NEXT: v_add_f32_e32 v25, s4, v25 -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_readfirstlane_b32 s4, v25 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[23:24] -; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[20:21] -; VI-NEXT: v_mov_b32_e32 v21, v19 -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[20:21] -; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] -; VI-NEXT: v_mov_b32_e32 v18, v16 -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[17:18] -; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[14:15] -; VI-NEXT: v_mov_b32_e32 v15, v13 -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_readfirstlane_b32 s4, v2 ; VI-NEXT: s_bfe_u32 s5, s4, 0x10010 -; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[14:15] -; VI-NEXT: v_lshrrev_b64 v[11:12], 16, v[11:12] ; VI-NEXT: s_add_i32 s5, s5, s4 -; VI-NEXT: v_mov_b32_e32 v12, v10 -; VI-NEXT: s_add_i32 s7, s5, 0x7fff -; VI-NEXT: s_or_b32 s9, s4, 0x400000 -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; VI-NEXT: s_add_i32 s47, s5, 0x7fff +; VI-NEXT: s_or_b32 s57, s4, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[11:12] -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[8:9] -; VI-NEXT: s_cselect_b32 s4, s9, s7 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_lshrrev_b64 v[5:6], 16, v[5:6] -; VI-NEXT: v_mov_b32_e32 v9, v7 -; VI-NEXT: s_lshr_b32 s15, s4, 16 -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] -; VI-NEXT: v_mov_b32_e32 v6, v4 -; VI-NEXT: s_lshr_b64 s[4:5], s[14:15], 16 -; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[8:9] -; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: s_mov_b32 s27, s44 -; VI-NEXT: s_mov_b32 s29, s58 -; VI-NEXT: s_mov_b32 s5, s62 -; VI-NEXT: v_lshrrev_b64 v[30:31], 24, v[5:6] -; VI-NEXT: s_mov_b32 s17, s6 -; VI-NEXT: s_mov_b32 s19, s8 -; VI-NEXT: s_mov_b32 s21, s10 -; VI-NEXT: s_mov_b32 s23, s12 -; VI-NEXT: s_mov_b32 s25, s40 -; VI-NEXT: s_lshr_b64 s[74:75], s[4:5], 24 -; VI-NEXT: s_lshr_b64 s[78:79], s[28:29], 24 -; VI-NEXT: s_lshr_b64 s[30:31], s[26:27], 24 -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[2:3] -; VI-NEXT: s_lshr_b64 s[36:37], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[38:39], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[48:49], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[50:51], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[52:53], s[16:17], 24 -; VI-NEXT: s_lshr_b32 s11, s62, 24 -; VI-NEXT: s_lshr_b32 s13, s62, 16 -; VI-NEXT: s_lshr_b32 s14, s62, 8 -; VI-NEXT: s_lshr_b32 s15, s4, 16 -; VI-NEXT: s_lshr_b32 s17, s4, 8 -; VI-NEXT: s_lshr_b32 s19, s58, 24 -; VI-NEXT: s_lshr_b32 s21, s58, 16 -; VI-NEXT: s_lshr_b32 s23, s58, 8 -; VI-NEXT: s_lshr_b32 s25, s28, 16 -; VI-NEXT: s_lshr_b32 s27, s28, 8 -; VI-NEXT: s_lshr_b32 s29, s44, 24 -; VI-NEXT: s_lshr_b32 s41, s44, 16 -; VI-NEXT: s_lshr_b32 s42, s44, 8 -; VI-NEXT: s_lshr_b32 s43, s26, 16 -; VI-NEXT: s_lshr_b32 s45, s26, 8 -; VI-NEXT: s_lshr_b32 s46, s40, 24 -; VI-NEXT: s_lshr_b32 s47, s40, 16 -; VI-NEXT: s_lshr_b32 s56, s40, 8 -; VI-NEXT: s_lshr_b32 s57, s24, 16 -; VI-NEXT: s_lshr_b32 s59, s24, 8 -; VI-NEXT: s_lshr_b32 s60, s12, 24 -; VI-NEXT: s_lshr_b32 s61, s12, 16 -; VI-NEXT: s_lshr_b32 s63, s12, 8 -; VI-NEXT: s_lshr_b32 s72, s22, 16 -; VI-NEXT: s_lshr_b32 s73, s22, 8 -; VI-NEXT: s_lshr_b32 s75, s10, 24 -; VI-NEXT: s_lshr_b32 s76, s10, 16 -; VI-NEXT: s_lshr_b32 s77, s10, 8 -; VI-NEXT: s_lshr_b32 s79, s20, 16 -; VI-NEXT: s_lshr_b32 s88, s20, 8 -; VI-NEXT: s_lshr_b32 s89, s8, 24 -; VI-NEXT: s_lshr_b32 s90, s8, 16 -; VI-NEXT: s_lshr_b32 s91, s8, 8 -; VI-NEXT: s_lshr_b32 s31, s18, 16 -; VI-NEXT: s_lshr_b32 s34, s18, 8 -; VI-NEXT: s_lshr_b32 vcc_lo, s6, 24 -; VI-NEXT: s_lshr_b32 vcc_hi, s6, 16 -; VI-NEXT: s_lshr_b32 s35, s6, 8 -; VI-NEXT: s_lshr_b32 s9, s16, 16 -; VI-NEXT: s_lshr_b32 s7, s16, 8 -; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v22 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v22 -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v23 -; VI-NEXT: v_lshrrev_b32_e32 v18, 24, v19 -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v19 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v20 -; VI-NEXT: v_lshrrev_b32_e32 v35, 24, v16 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v16 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v17 -; VI-NEXT: v_lshrrev_b32_e32 v48, 24, v13 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v13 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v14 -; VI-NEXT: v_lshrrev_b32_e32 v53, 24, v10 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v10 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v11 -; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v7 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v47, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v60, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v26, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v2 -; VI-NEXT: s_branch .LBB91_5 -; VI-NEXT: .LBB91_3: -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr7 -; VI-NEXT: ; implicit-def: $sgpr9 -; VI-NEXT: ; implicit-def: $sgpr8 -; VI-NEXT: ; implicit-def: $sgpr11 -; VI-NEXT: ; implicit-def: $sgpr10 -; VI-NEXT: ; implicit-def: $sgpr13 -; VI-NEXT: ; implicit-def: $sgpr12 -; VI-NEXT: ; implicit-def: $sgpr41 -; VI-NEXT: ; implicit-def: $sgpr40 -; VI-NEXT: ; implicit-def: $sgpr45 -; VI-NEXT: ; implicit-def: $sgpr44 -; VI-NEXT: ; implicit-def: $sgpr59 -; VI-NEXT: ; implicit-def: $sgpr58 -; VI-NEXT: ; implicit-def: $sgpr63 -; VI-NEXT: ; implicit-def: $sgpr62 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr47 -; VI-NEXT: ; implicit-def: $sgpr82 -; VI-NEXT: ; implicit-def: $sgpr83 -; VI-NEXT: ; implicit-def: $sgpr56 -; VI-NEXT: ; implicit-def: $sgpr57 -; VI-NEXT: ; implicit-def: $sgpr61 -; VI-NEXT: ; implicit-def: $sgpr72 -; VI-NEXT: ; implicit-def: $sgpr14 -; VI-NEXT: ; implicit-def: $sgpr15 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr78 -; VI-NEXT: ; implicit-def: $sgpr79 -; VI-NEXT: ; implicit-def: $sgpr73 -; VI-NEXT: ; implicit-def: $sgpr74 -; VI-NEXT: ; implicit-def: $sgpr75 -; VI-NEXT: ; implicit-def: $sgpr84 -; VI-NEXT: ; implicit-def: $sgpr85 -; VI-NEXT: ; implicit-def: $sgpr86 -; VI-NEXT: ; implicit-def: $sgpr42 -; VI-NEXT: ; implicit-def: $sgpr43 -; VI-NEXT: ; implicit-def: $sgpr87 -; VI-NEXT: ; implicit-def: $sgpr80 -; VI-NEXT: ; implicit-def: $sgpr70 -; VI-NEXT: ; implicit-def: $sgpr68 -; VI-NEXT: ; implicit-def: $sgpr66 -; VI-NEXT: ; implicit-def: $sgpr64 -; VI-NEXT: ; implicit-def: $sgpr54 -; VI-NEXT: ; implicit-def: $sgpr52 -; VI-NEXT: ; implicit-def: $sgpr50 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; kill: killed $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: v_writelane_b32 v61, s6, 0 -; VI-NEXT: v_writelane_b32 v61, s7, 1 -; VI-NEXT: v_writelane_b32 v61, s8, 2 -; VI-NEXT: v_writelane_b32 v61, s9, 3 -; VI-NEXT: v_writelane_b32 v61, s10, 4 -; VI-NEXT: v_writelane_b32 v61, s11, 5 -; VI-NEXT: v_writelane_b32 v61, s12, 6 -; VI-NEXT: v_writelane_b32 v61, s13, 7 -; VI-NEXT: v_writelane_b32 v61, s40, 8 -; VI-NEXT: v_writelane_b32 v61, s41, 9 -; VI-NEXT: v_writelane_b32 v61, s44, 10 -; VI-NEXT: v_writelane_b32 v61, s45, 11 -; VI-NEXT: v_writelane_b32 v61, s58, 12 -; VI-NEXT: v_writelane_b32 v61, s59, 13 -; VI-NEXT: v_writelane_b32 v61, s62, 14 -; VI-NEXT: v_writelane_b32 v61, s63, 15 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr8 -; VI-NEXT: ; implicit-def: $sgpr10 -; VI-NEXT: ; implicit-def: $sgpr12 -; VI-NEXT: ; implicit-def: $sgpr40 -; VI-NEXT: ; implicit-def: $sgpr44 -; VI-NEXT: ; implicit-def: $sgpr58 -; VI-NEXT: ; implicit-def: $sgpr62 -; VI-NEXT: s_branch .LBB91_2 -; VI-NEXT: .LBB91_4: -; VI-NEXT: v_mov_b32_e32 v28, s50 -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v28, s52 -; VI-NEXT: v_readlane_b32 s5, v61, 16 -; VI-NEXT: v_mov_b32_e32 v57, s5 -; VI-NEXT: v_readlane_b32 s5, v61, 17 -; VI-NEXT: v_mov_b32_e32 v59, s5 -; VI-NEXT: v_readlane_b32 s5, v61, 18 -; VI-NEXT: v_mov_b32_e32 v47, s5 -; VI-NEXT: v_readlane_b32 s5, v61, 19 -; VI-NEXT: v_mov_b32_e32 v56, s5 -; VI-NEXT: v_readlane_b32 s5, v61, 20 -; VI-NEXT: v_mov_b32_e32 v58, s5 -; VI-NEXT: v_readlane_b32 s5, v61, 21 -; VI-NEXT: v_mov_b32_e32 v25, s5 -; VI-NEXT: v_readlane_b32 s5, v61, 22 -; VI-NEXT: v_mov_b32_e32 v27, s5 -; VI-NEXT: v_readlane_b32 s5, v61, 23 -; VI-NEXT: v_mov_b32_e32 v38, s79 -; VI-NEXT: v_mov_b32_e32 v39, s78 -; VI-NEXT: v_mov_b32_e32 v35, s75 -; VI-NEXT: v_mov_b32_e32 v36, s74 -; VI-NEXT: v_mov_b32_e32 v60, s5 -; VI-NEXT: v_readlane_b32 s5, v61, 24 -; VI-NEXT: v_mov_b32_e32 v14, s30 -; VI-NEXT: v_mov_b32_e32 v13, s31 -; VI-NEXT: v_readlane_b32 s74, v61, 14 -; VI-NEXT: v_readlane_b32 s78, v61, 12 -; VI-NEXT: v_readlane_b32 s30, v61, 10 -; VI-NEXT: v_mov_b32_e32 v24, s5 -; VI-NEXT: v_readlane_b32 s5, v61, 25 -; VI-NEXT: v_mov_b32_e32 v8, s36 -; VI-NEXT: v_mov_b32_e32 v7, s37 -; VI-NEXT: v_mov_b32_e32 v5, s38 -; VI-NEXT: v_mov_b32_e32 v4, s39 -; VI-NEXT: v_mov_b32_e32 v2, s48 -; VI-NEXT: v_mov_b32_e32 v1, s49 -; VI-NEXT: v_readlane_b32 s75, v61, 15 -; VI-NEXT: v_readlane_b32 s79, v61, 13 -; VI-NEXT: v_readlane_b32 s31, v61, 11 -; VI-NEXT: v_readlane_b32 s36, v61, 8 -; VI-NEXT: v_readlane_b32 s38, v61, 6 -; VI-NEXT: v_readlane_b32 s48, v61, 4 -; VI-NEXT: v_readlane_b32 s50, v61, 2 -; VI-NEXT: v_readlane_b32 s52, v61, 0 -; VI-NEXT: v_mov_b32_e32 v12, s13 -; VI-NEXT: v_mov_b32_e32 v15, s11 -; VI-NEXT: v_mov_b32_e32 v3, s87 -; VI-NEXT: v_mov_b32_e32 v6, s43 -; VI-NEXT: v_mov_b32_e32 v9, s42 -; VI-NEXT: v_mov_b32_e32 v33, s9 -; VI-NEXT: v_mov_b32_e32 v34, s7 -; VI-NEXT: v_mov_b32_e32 v18, s86 -; VI-NEXT: v_mov_b32_e32 v21, s85 -; VI-NEXT: v_mov_b32_e32 v32, s84 -; VI-NEXT: v_mov_b32_e32 v37, s73 -; VI-NEXT: v_mov_b32_e32 v51, s72 -; VI-NEXT: v_mov_b32_e32 v52, s61 -; VI-NEXT: v_mov_b32_e32 v48, s60 -; VI-NEXT: v_mov_b32_e32 v49, s15 -; VI-NEXT: v_mov_b32_e32 v50, s14 -; VI-NEXT: v_mov_b32_e32 v40, s83 -; VI-NEXT: v_mov_b32_e32 v41, s82 -; VI-NEXT: v_mov_b32_e32 v53, s63 -; VI-NEXT: v_mov_b32_e32 v54, s57 -; VI-NEXT: v_mov_b32_e32 v55, s56 -; VI-NEXT: v_mov_b32_e32 v45, s59 -; VI-NEXT: v_mov_b32_e32 v46, s45 -; VI-NEXT: v_mov_b32_e32 v42, s41 -; VI-NEXT: v_mov_b32_e32 v43, s47 -; VI-NEXT: v_mov_b32_e32 v44, s46 -; VI-NEXT: v_mov_b32_e32 v26, s5 -; VI-NEXT: v_mov_b32_e32 v23, s76 -; VI-NEXT: v_mov_b32_e32 v22, s77 -; VI-NEXT: v_mov_b32_e32 v20, s88 -; VI-NEXT: v_mov_b32_e32 v19, s89 -; VI-NEXT: v_mov_b32_e32 v17, s90 -; VI-NEXT: v_mov_b32_e32 v16, s91 -; VI-NEXT: v_mov_b32_e32 v11, s34 -; VI-NEXT: v_mov_b32_e32 v10, s35 -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v28, s54 -; VI-NEXT: v_mov_b32_e32 v30, s70 -; VI-NEXT: v_mov_b32_e32 v31, s80 -; VI-NEXT: v_readlane_b32 s11, v61, 26 -; VI-NEXT: v_readlane_b32 s13, v61, 27 -; VI-NEXT: v_readlane_b32 s14, v61, 28 -; VI-NEXT: v_readlane_b32 s15, v61, 29 -; VI-NEXT: v_readlane_b32 s17, v61, 30 -; VI-NEXT: v_readlane_b32 s19, v61, 31 -; VI-NEXT: v_readlane_b32 s21, v61, 32 -; VI-NEXT: v_readlane_b32 s23, v61, 33 -; VI-NEXT: v_readlane_b32 s25, v61, 34 -; VI-NEXT: v_readlane_b32 s27, v61, 35 -; VI-NEXT: v_readlane_b32 s29, v61, 36 -; VI-NEXT: v_readlane_b32 s41, v61, 37 -; VI-NEXT: v_readlane_b32 s42, v61, 38 -; VI-NEXT: v_readlane_b32 s43, v61, 39 -; VI-NEXT: v_readlane_b32 s45, v61, 40 -; VI-NEXT: v_readlane_b32 s46, v61, 41 -; VI-NEXT: v_readlane_b32 s47, v61, 42 -; VI-NEXT: v_readlane_b32 s56, v61, 43 -; VI-NEXT: v_readlane_b32 s57, v61, 44 -; VI-NEXT: v_readlane_b32 s59, v61, 45 -; VI-NEXT: v_readlane_b32 s60, v61, 46 -; VI-NEXT: v_readlane_b32 s61, v61, 47 -; VI-NEXT: v_readlane_b32 s63, v61, 48 -; VI-NEXT: v_readlane_b32 s72, v61, 49 -; VI-NEXT: v_readlane_b32 s73, v61, 50 -; VI-NEXT: v_readlane_b32 s75, v61, 51 -; VI-NEXT: v_readlane_b32 s76, v61, 52 -; VI-NEXT: v_readlane_b32 s77, v61, 53 -; VI-NEXT: v_readlane_b32 s79, v61, 54 -; VI-NEXT: v_readlane_b32 s88, v61, 55 -; VI-NEXT: v_readlane_b32 s89, v61, 56 -; VI-NEXT: v_readlane_b32 s90, v61, 57 -; VI-NEXT: v_readlane_b32 s91, v61, 58 -; VI-NEXT: v_readlane_b32 s31, v61, 59 -; VI-NEXT: v_readlane_b32 s34, v61, 60 -; VI-NEXT: v_readlane_b32 s37, v61, 9 -; VI-NEXT: v_readlane_b32 vcc_lo, v61, 61 -; VI-NEXT: v_readlane_b32 vcc_hi, v61, 62 -; VI-NEXT: v_readlane_b32 s35, v61, 63 -; VI-NEXT: v_readlane_b32 s9, v62, 0 -; VI-NEXT: v_readlane_b32 s7, v62, 1 -; VI-NEXT: v_readlane_b32 s39, v61, 7 -; VI-NEXT: v_readlane_b32 s49, v61, 5 -; VI-NEXT: v_readlane_b32 s51, v61, 3 -; VI-NEXT: v_readlane_b32 s53, v61, 1 -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v28, s64 -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v28, s66 -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v28, s68 -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: .LBB91_5: ; %end -; VI-NEXT: s_and_b32 s5, s16, 0xff -; VI-NEXT: s_lshl_b32 s7, s7, 8 +; VI-NEXT: s_cselect_b32 s4, s57, s47 +; VI-NEXT: s_lshl_b32 s47, s45, 16 +; VI-NEXT: v_add_f32_e32 v2, s47, v1 +; VI-NEXT: v_readfirstlane_b32 s47, v2 +; VI-NEXT: s_bfe_u32 s57, s47, 0x10010 +; VI-NEXT: s_lshr_b32 s79, s4, 16 +; VI-NEXT: s_add_i32 s57, s57, s47 +; VI-NEXT: s_lshr_b64 s[4:5], s[78:79], 16 +; VI-NEXT: s_addk_i32 s57, 0x7fff +; VI-NEXT: s_bitset1_b32 s47, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[78:79], vcc, exec +; VI-NEXT: s_cselect_b32 s78, s47, s57 +; VI-NEXT: s_and_b32 s45, s45, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s45, v1 +; VI-NEXT: v_readfirstlane_b32 s45, v2 +; VI-NEXT: s_bfe_u32 s47, s45, 0x10010 +; VI-NEXT: s_add_i32 s47, s47, s45 +; VI-NEXT: s_addk_i32 s47, 0x7fff +; VI-NEXT: s_bitset1_b32 s45, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[88:89], vcc, exec +; VI-NEXT: s_cselect_b32 s45, s45, s47 +; VI-NEXT: s_lshr_b32 s79, s45, 16 +; VI-NEXT: s_lshl_b32 s45, s44, 16 +; VI-NEXT: v_add_f32_e32 v2, s45, v1 +; VI-NEXT: v_readfirstlane_b32 s45, v2 +; VI-NEXT: s_bfe_u32 s47, s45, 0x10010 +; VI-NEXT: s_add_i32 s47, s47, s45 +; VI-NEXT: s_lshr_b64 s[78:79], s[78:79], 16 +; VI-NEXT: s_addk_i32 s47, 0x7fff +; VI-NEXT: s_bitset1_b32 s45, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[88:89], vcc, exec +; VI-NEXT: s_cselect_b32 s88, s45, s47 +; VI-NEXT: s_and_b32 s44, s44, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s44, v1 +; VI-NEXT: v_readfirstlane_b32 s44, v2 +; VI-NEXT: s_bfe_u32 s45, s44, 0x10010 +; VI-NEXT: s_add_i32 s45, s45, s44 +; VI-NEXT: s_add_i32 s47, s45, 0x7fff +; VI-NEXT: s_or_b32 s57, s44, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[44:45], vcc, exec +; VI-NEXT: s_cselect_b32 s44, s57, s47 +; VI-NEXT: s_lshl_b32 s47, s43, 16 +; VI-NEXT: v_add_f32_e32 v2, s47, v1 +; VI-NEXT: v_readfirstlane_b32 s47, v2 +; VI-NEXT: s_bfe_u32 s57, s47, 0x10010 +; VI-NEXT: s_lshr_b32 s89, s44, 16 +; VI-NEXT: s_add_i32 s57, s57, s47 +; VI-NEXT: s_lshr_b64 s[44:45], s[88:89], 16 +; VI-NEXT: s_addk_i32 s57, 0x7fff +; VI-NEXT: s_bitset1_b32 s47, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[88:89], vcc, exec +; VI-NEXT: s_cselect_b32 s88, s47, s57 +; VI-NEXT: s_and_b32 s43, s43, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s43, v1 +; VI-NEXT: v_readfirstlane_b32 s43, v2 +; VI-NEXT: s_bfe_u32 s47, s43, 0x10010 +; VI-NEXT: s_add_i32 s47, s47, s43 +; VI-NEXT: s_addk_i32 s47, 0x7fff +; VI-NEXT: s_bitset1_b32 s43, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[90:91], vcc, exec +; VI-NEXT: s_cselect_b32 s43, s43, s47 +; VI-NEXT: s_lshr_b32 s89, s43, 16 +; VI-NEXT: s_lshl_b32 s43, s42, 16 +; VI-NEXT: v_add_f32_e32 v2, s43, v1 +; VI-NEXT: v_readfirstlane_b32 s43, v2 +; VI-NEXT: s_bfe_u32 s47, s43, 0x10010 +; VI-NEXT: s_add_i32 s47, s47, s43 +; VI-NEXT: s_lshr_b64 s[88:89], s[88:89], 16 +; VI-NEXT: s_addk_i32 s47, 0x7fff +; VI-NEXT: s_bitset1_b32 s43, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[90:91], vcc, exec +; VI-NEXT: s_cselect_b32 s90, s43, s47 +; VI-NEXT: s_and_b32 s42, s42, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s42, v1 +; VI-NEXT: v_readfirstlane_b32 s42, v2 +; VI-NEXT: s_bfe_u32 s43, s42, 0x10010 +; VI-NEXT: s_add_i32 s43, s43, s42 +; VI-NEXT: s_add_i32 s47, s43, 0x7fff +; VI-NEXT: s_or_b32 s57, s42, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[42:43], vcc, exec +; VI-NEXT: s_cselect_b32 s42, s57, s47 +; VI-NEXT: s_lshl_b32 s47, s29, 16 +; VI-NEXT: v_add_f32_e32 v2, s47, v1 +; VI-NEXT: v_readfirstlane_b32 s47, v2 +; VI-NEXT: s_bfe_u32 s57, s47, 0x10010 +; VI-NEXT: s_lshr_b32 s91, s42, 16 +; VI-NEXT: s_add_i32 s57, s57, s47 +; VI-NEXT: s_lshr_b64 s[42:43], s[90:91], 16 +; VI-NEXT: s_addk_i32 s57, 0x7fff +; VI-NEXT: s_bitset1_b32 s47, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[90:91], vcc, exec +; VI-NEXT: s_cselect_b32 s90, s47, s57 +; VI-NEXT: s_and_b32 s29, s29, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s29, v1 +; VI-NEXT: v_readfirstlane_b32 s29, v2 +; VI-NEXT: s_bfe_u32 s47, s29, 0x10010 +; VI-NEXT: s_add_i32 s47, s47, s29 +; VI-NEXT: s_addk_i32 s47, 0x7fff +; VI-NEXT: s_bitset1_b32 s29, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 vcc, vcc, exec +; VI-NEXT: s_cselect_b32 s29, s29, s47 +; VI-NEXT: s_lshr_b32 s91, s29, 16 +; VI-NEXT: s_lshl_b32 s29, s28, 16 +; VI-NEXT: v_add_f32_e32 v2, s29, v1 +; VI-NEXT: v_readfirstlane_b32 s29, v2 +; VI-NEXT: s_bfe_u32 s47, s29, 0x10010 +; VI-NEXT: s_add_i32 s47, s47, s29 +; VI-NEXT: s_lshr_b64 s[90:91], s[90:91], 16 +; VI-NEXT: s_addk_i32 s47, 0x7fff +; VI-NEXT: s_bitset1_b32 s29, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 vcc, vcc, exec +; VI-NEXT: s_cselect_b32 s30, s29, s47 +; VI-NEXT: s_and_b32 s28, s28, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s28, v1 +; VI-NEXT: v_readfirstlane_b32 s28, v2 +; VI-NEXT: s_bfe_u32 s29, s28, 0x10010 +; VI-NEXT: s_add_i32 s29, s29, s28 +; VI-NEXT: s_add_i32 s47, s29, 0x7fff +; VI-NEXT: s_or_b32 s57, s28, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[28:29], vcc, exec +; VI-NEXT: s_cselect_b32 s28, s57, s47 +; VI-NEXT: s_lshl_b32 s47, s27, 16 +; VI-NEXT: v_add_f32_e32 v2, s47, v1 +; VI-NEXT: v_readfirstlane_b32 s47, v2 +; VI-NEXT: s_bfe_u32 s57, s47, 0x10010 +; VI-NEXT: s_lshr_b32 s31, s28, 16 +; VI-NEXT: s_add_i32 s57, s57, s47 +; VI-NEXT: s_lshr_b64 s[28:29], s[30:31], 16 +; VI-NEXT: s_addk_i32 s57, 0x7fff +; VI-NEXT: s_bitset1_b32 s47, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 vcc, vcc, exec +; VI-NEXT: s_cselect_b32 s30, s47, s57 +; VI-NEXT: s_and_b32 s27, s27, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s27, v1 +; VI-NEXT: v_readfirstlane_b32 s27, v2 +; VI-NEXT: s_bfe_u32 s47, s27, 0x10010 +; VI-NEXT: s_add_i32 s47, s47, s27 +; VI-NEXT: s_addk_i32 s47, 0x7fff +; VI-NEXT: s_bitset1_b32 s27, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 vcc, vcc, exec +; VI-NEXT: s_cselect_b32 s27, s27, s47 +; VI-NEXT: s_lshr_b32 s31, s27, 16 +; VI-NEXT: s_lshl_b32 s27, s26, 16 +; VI-NEXT: v_add_f32_e32 v2, s27, v1 +; VI-NEXT: v_readfirstlane_b32 s27, v2 +; VI-NEXT: s_bfe_u32 s47, s27, 0x10010 +; VI-NEXT: s_add_i32 s47, s47, s27 +; VI-NEXT: s_lshr_b64 s[30:31], s[30:31], 16 +; VI-NEXT: s_addk_i32 s47, 0x7fff +; VI-NEXT: s_bitset1_b32 s27, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 vcc, vcc, exec +; VI-NEXT: s_cselect_b32 s34, s27, s47 +; VI-NEXT: s_and_b32 s26, s26, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s26, v1 +; VI-NEXT: v_readfirstlane_b32 s26, v2 +; VI-NEXT: s_bfe_u32 s27, s26, 0x10010 +; VI-NEXT: s_add_i32 s27, s27, s26 +; VI-NEXT: s_add_i32 s47, s27, 0x7fff +; VI-NEXT: s_or_b32 s57, s26, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[26:27], vcc, exec +; VI-NEXT: s_cselect_b32 s26, s57, s47 +; VI-NEXT: s_lshl_b32 s47, s25, 16 +; VI-NEXT: v_add_f32_e32 v2, s47, v1 +; VI-NEXT: v_readfirstlane_b32 s47, v2 +; VI-NEXT: s_bfe_u32 s57, s47, 0x10010 +; VI-NEXT: s_lshr_b32 s35, s26, 16 +; VI-NEXT: s_add_i32 s57, s57, s47 +; VI-NEXT: s_lshr_b64 s[26:27], s[34:35], 16 +; VI-NEXT: s_addk_i32 s57, 0x7fff +; VI-NEXT: s_bitset1_b32 s47, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 vcc, vcc, exec +; VI-NEXT: s_cselect_b32 s34, s47, s57 +; VI-NEXT: s_and_b32 s25, s25, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s25, v1 +; VI-NEXT: v_readfirstlane_b32 s25, v2 +; VI-NEXT: s_bfe_u32 s47, s25, 0x10010 +; VI-NEXT: s_add_i32 s47, s47, s25 +; VI-NEXT: s_addk_i32 s47, 0x7fff +; VI-NEXT: s_bitset1_b32 s25, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 vcc, vcc, exec +; VI-NEXT: s_cselect_b32 s25, s25, s47 +; VI-NEXT: s_lshr_b32 s35, s25, 16 +; VI-NEXT: s_lshl_b32 s25, s24, 16 +; VI-NEXT: v_add_f32_e32 v2, s25, v1 +; VI-NEXT: v_readfirstlane_b32 s25, v2 +; VI-NEXT: s_bfe_u32 s47, s25, 0x10010 +; VI-NEXT: s_add_i32 s47, s47, s25 +; VI-NEXT: s_lshr_b64 s[36:37], s[34:35], 16 +; VI-NEXT: s_addk_i32 s47, 0x7fff +; VI-NEXT: s_bitset1_b32 s25, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 vcc, vcc, exec +; VI-NEXT: s_cselect_b32 s34, s25, s47 +; VI-NEXT: s_and_b32 s24, s24, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s24, v1 +; VI-NEXT: v_readfirstlane_b32 s24, v2 +; VI-NEXT: s_bfe_u32 s25, s24, 0x10010 +; VI-NEXT: s_add_i32 s25, s25, s24 +; VI-NEXT: s_add_i32 s47, s25, 0x7fff +; VI-NEXT: s_or_b32 s57, s24, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[24:25], vcc, exec +; VI-NEXT: s_cselect_b32 s24, s57, s47 +; VI-NEXT: s_lshl_b32 s47, s23, 16 +; VI-NEXT: v_add_f32_e32 v2, s47, v1 +; VI-NEXT: v_readfirstlane_b32 s47, v2 +; VI-NEXT: s_bfe_u32 s57, s47, 0x10010 +; VI-NEXT: s_lshr_b32 s35, s24, 16 +; VI-NEXT: s_add_i32 s57, s57, s47 +; VI-NEXT: s_lshr_b64 s[24:25], s[34:35], 16 +; VI-NEXT: s_addk_i32 s57, 0x7fff +; VI-NEXT: s_bitset1_b32 s47, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 vcc, vcc, exec +; VI-NEXT: s_cselect_b32 s34, s47, s57 +; VI-NEXT: s_and_b32 s23, s23, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s23, v1 +; VI-NEXT: v_readfirstlane_b32 s23, v2 +; VI-NEXT: s_bfe_u32 s47, s23, 0x10010 +; VI-NEXT: s_add_i32 s47, s47, s23 +; VI-NEXT: s_addk_i32 s47, 0x7fff +; VI-NEXT: s_bitset1_b32 s23, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 vcc, vcc, exec +; VI-NEXT: s_cselect_b32 s23, s23, s47 +; VI-NEXT: s_lshr_b32 s35, s23, 16 +; VI-NEXT: s_lshl_b32 s23, s22, 16 +; VI-NEXT: v_add_f32_e32 v2, s23, v1 +; VI-NEXT: v_readfirstlane_b32 s23, v2 +; VI-NEXT: s_bfe_u32 s47, s23, 0x10010 +; VI-NEXT: s_add_i32 s47, s47, s23 +; VI-NEXT: s_lshr_b64 s[48:49], s[34:35], 16 +; VI-NEXT: s_addk_i32 s47, 0x7fff +; VI-NEXT: s_bitset1_b32 s23, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 vcc, vcc, exec +; VI-NEXT: s_cselect_b32 s34, s23, s47 +; VI-NEXT: s_and_b32 s22, s22, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s22, v1 +; VI-NEXT: v_readfirstlane_b32 s22, v2 +; VI-NEXT: s_bfe_u32 s23, s22, 0x10010 +; VI-NEXT: s_add_i32 s23, s23, s22 +; VI-NEXT: s_add_i32 s47, s23, 0x7fff +; VI-NEXT: s_or_b32 s57, s22, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[22:23], vcc, exec +; VI-NEXT: s_cselect_b32 s22, s57, s47 +; VI-NEXT: s_lshl_b32 s47, s21, 16 +; VI-NEXT: v_add_f32_e32 v2, s47, v1 +; VI-NEXT: v_readfirstlane_b32 s47, v2 +; VI-NEXT: s_bfe_u32 s57, s47, 0x10010 +; VI-NEXT: s_lshr_b32 s35, s22, 16 +; VI-NEXT: s_add_i32 s57, s57, s47 +; VI-NEXT: s_lshr_b64 s[22:23], s[34:35], 16 +; VI-NEXT: s_addk_i32 s57, 0x7fff +; VI-NEXT: s_bitset1_b32 s47, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 vcc, vcc, exec +; VI-NEXT: s_cselect_b32 s34, s47, s57 +; VI-NEXT: s_and_b32 s21, s21, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s21, v1 +; VI-NEXT: v_readfirstlane_b32 s21, v2 +; VI-NEXT: s_bfe_u32 s47, s21, 0x10010 +; VI-NEXT: s_add_i32 s47, s47, s21 +; VI-NEXT: s_addk_i32 s47, 0x7fff +; VI-NEXT: s_bitset1_b32 s21, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 vcc, vcc, exec +; VI-NEXT: s_cselect_b32 s21, s21, s47 +; VI-NEXT: s_lshr_b32 s35, s21, 16 +; VI-NEXT: s_lshl_b32 s21, s20, 16 +; VI-NEXT: v_add_f32_e32 v2, s21, v1 +; VI-NEXT: v_readfirstlane_b32 s21, v2 +; VI-NEXT: s_bfe_u32 s47, s21, 0x10010 +; VI-NEXT: s_add_i32 s47, s47, s21 +; VI-NEXT: s_lshr_b64 s[52:53], s[34:35], 16 +; VI-NEXT: s_addk_i32 s47, 0x7fff +; VI-NEXT: s_bitset1_b32 s21, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 vcc, vcc, exec +; VI-NEXT: s_cselect_b32 s34, s21, s47 +; VI-NEXT: s_and_b32 s20, s20, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s20, v1 +; VI-NEXT: v_readfirstlane_b32 s20, v2 +; VI-NEXT: s_bfe_u32 s21, s20, 0x10010 +; VI-NEXT: s_add_i32 s21, s21, s20 +; VI-NEXT: s_add_i32 s47, s21, 0x7fff +; VI-NEXT: s_or_b32 s57, s20, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[20:21], vcc, exec +; VI-NEXT: s_cselect_b32 s20, s57, s47 +; VI-NEXT: s_lshl_b32 s47, s19, 16 +; VI-NEXT: v_add_f32_e32 v2, s47, v1 +; VI-NEXT: v_readfirstlane_b32 s47, v2 +; VI-NEXT: s_bfe_u32 s57, s47, 0x10010 +; VI-NEXT: s_lshr_b32 s35, s20, 16 +; VI-NEXT: s_add_i32 s57, s57, s47 +; VI-NEXT: s_lshr_b64 s[20:21], s[34:35], 16 +; VI-NEXT: s_addk_i32 s57, 0x7fff +; VI-NEXT: s_bitset1_b32 s47, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 vcc, vcc, exec +; VI-NEXT: s_cselect_b32 s34, s47, s57 +; VI-NEXT: s_and_b32 s19, s19, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s19, v1 +; VI-NEXT: v_readfirstlane_b32 s19, v2 +; VI-NEXT: s_bfe_u32 s47, s19, 0x10010 +; VI-NEXT: s_add_i32 s47, s47, s19 +; VI-NEXT: s_addk_i32 s47, 0x7fff +; VI-NEXT: s_bitset1_b32 s19, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 vcc, vcc, exec +; VI-NEXT: s_cselect_b32 s19, s19, s47 +; VI-NEXT: s_lshr_b32 s35, s19, 16 +; VI-NEXT: s_lshl_b32 s19, s18, 16 +; VI-NEXT: v_add_f32_e32 v2, s19, v1 +; VI-NEXT: v_readfirstlane_b32 s19, v2 +; VI-NEXT: s_bfe_u32 s47, s19, 0x10010 +; VI-NEXT: s_add_i32 s47, s47, s19 +; VI-NEXT: s_lshr_b64 s[64:65], s[34:35], 16 +; VI-NEXT: s_addk_i32 s47, 0x7fff +; VI-NEXT: s_bitset1_b32 s19, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 vcc, vcc, exec +; VI-NEXT: s_cselect_b32 s34, s19, s47 +; VI-NEXT: s_and_b32 s18, s18, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v1, s18, v1 +; VI-NEXT: v_readfirstlane_b32 s18, v1 +; VI-NEXT: s_bfe_u32 s19, s18, 0x10010 +; VI-NEXT: s_add_i32 s19, s19, s18 +; VI-NEXT: s_add_i32 s47, s19, 0x7fff +; VI-NEXT: s_or_b32 s57, s18, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b64 s[18:19], vcc, exec +; VI-NEXT: s_cselect_b32 s18, s57, s47 +; VI-NEXT: s_lshr_b32 s47, s64, 24 +; VI-NEXT: s_lshr_b32 s35, s18, 16 +; VI-NEXT: v_writelane_b32 v22, s47, 10 +; VI-NEXT: s_lshr_b32 s47, s64, 16 +; VI-NEXT: s_lshr_b64 s[18:19], s[34:35], 16 +; VI-NEXT: v_writelane_b32 v22, s47, 11 +; VI-NEXT: s_lshr_b32 s47, s64, 8 +; VI-NEXT: v_writelane_b32 v22, s47, 12 +; VI-NEXT: s_lshr_b32 s47, s18, 16 +; VI-NEXT: v_writelane_b32 v22, s47, 13 +; VI-NEXT: s_lshr_b32 s47, s18, 8 +; VI-NEXT: v_writelane_b32 v22, s47, 14 +; VI-NEXT: s_lshr_b32 s47, s52, 24 +; VI-NEXT: v_writelane_b32 v22, s47, 15 +; VI-NEXT: s_lshr_b32 s47, s52, 16 +; VI-NEXT: v_writelane_b32 v22, s47, 16 +; VI-NEXT: s_lshr_b32 s47, s52, 8 +; VI-NEXT: v_writelane_b32 v22, s47, 17 +; VI-NEXT: s_lshr_b32 s47, s20, 16 +; VI-NEXT: v_writelane_b32 v22, s47, 18 +; VI-NEXT: s_lshr_b32 s47, s20, 8 +; VI-NEXT: v_writelane_b32 v22, s47, 19 +; VI-NEXT: s_lshr_b32 s47, s48, 24 +; VI-NEXT: v_writelane_b32 v22, s47, 20 +; VI-NEXT: s_lshr_b32 s47, s48, 16 +; VI-NEXT: v_writelane_b32 v22, s47, 21 +; VI-NEXT: s_lshr_b32 s47, s48, 8 +; VI-NEXT: v_writelane_b32 v22, s47, 22 +; VI-NEXT: s_lshr_b32 s47, s22, 16 +; VI-NEXT: v_writelane_b32 v22, s47, 23 +; VI-NEXT: s_lshr_b32 s47, s22, 8 +; VI-NEXT: v_writelane_b32 v22, s47, 24 +; VI-NEXT: s_lshr_b32 s47, s36, 24 +; VI-NEXT: v_writelane_b32 v22, s47, 25 +; VI-NEXT: s_lshr_b32 s47, s36, 16 +; VI-NEXT: v_writelane_b32 v22, s47, 26 +; VI-NEXT: s_lshr_b32 s47, s36, 8 +; VI-NEXT: v_writelane_b32 v22, s47, 27 +; VI-NEXT: s_lshr_b32 s47, s24, 16 +; VI-NEXT: v_writelane_b32 v22, s47, 28 +; VI-NEXT: s_lshr_b32 s47, s24, 8 +; VI-NEXT: v_writelane_b32 v22, s47, 29 +; VI-NEXT: s_lshr_b32 s47, s30, 24 +; VI-NEXT: v_writelane_b32 v22, s47, 30 +; VI-NEXT: s_lshr_b32 s47, s30, 16 +; VI-NEXT: v_writelane_b32 v22, s47, 31 +; VI-NEXT: s_lshr_b32 s47, s30, 8 +; VI-NEXT: v_writelane_b32 v22, s47, 32 +; VI-NEXT: s_lshr_b32 s47, s26, 16 +; VI-NEXT: v_writelane_b32 v22, s47, 33 +; VI-NEXT: s_lshr_b32 s47, s26, 8 +; VI-NEXT: v_writelane_b32 v22, s47, 34 +; VI-NEXT: s_lshr_b32 s47, s90, 24 +; VI-NEXT: v_writelane_b32 v22, s47, 35 +; VI-NEXT: s_lshr_b32 s47, s90, 16 +; VI-NEXT: v_writelane_b32 v22, s47, 36 +; VI-NEXT: s_lshr_b32 s47, s90, 8 +; VI-NEXT: v_writelane_b32 v22, s47, 37 +; VI-NEXT: s_lshr_b32 s47, s28, 16 +; VI-NEXT: v_writelane_b32 v22, s47, 38 +; VI-NEXT: s_lshr_b32 s47, s28, 8 +; VI-NEXT: v_writelane_b32 v22, s47, 39 +; VI-NEXT: s_lshr_b32 s59, s76, 24 +; VI-NEXT: v_writelane_b32 v22, s59, 40 +; VI-NEXT: s_lshr_b32 s59, s76, 16 +; VI-NEXT: v_writelane_b32 v22, s59, 41 +; VI-NEXT: s_lshr_b32 s59, s76, 8 +; VI-NEXT: v_writelane_b32 v22, s59, 42 +; VI-NEXT: s_lshr_b32 s59, s4, 16 +; VI-NEXT: v_writelane_b32 v22, s59, 43 +; VI-NEXT: s_lshr_b32 s59, s4, 8 +; VI-NEXT: v_writelane_b32 v22, s59, 44 +; VI-NEXT: s_lshr_b32 s59, s74, 24 +; VI-NEXT: v_writelane_b32 v22, s59, 45 +; VI-NEXT: s_lshr_b32 s59, s74, 16 +; VI-NEXT: v_writelane_b32 v22, s59, 46 +; VI-NEXT: s_lshr_b32 s59, s74, 8 +; VI-NEXT: v_writelane_b32 v22, s59, 47 +; VI-NEXT: s_lshr_b32 s59, s6, 16 +; VI-NEXT: v_writelane_b32 v22, s59, 48 +; VI-NEXT: s_lshr_b32 s59, s6, 8 +; VI-NEXT: v_writelane_b32 v22, s59, 49 +; VI-NEXT: s_lshr_b32 s59, s72, 24 +; VI-NEXT: v_writelane_b32 v22, s59, 50 +; VI-NEXT: s_lshr_b32 s59, s72, 16 +; VI-NEXT: v_writelane_b32 v22, s59, 51 +; VI-NEXT: s_lshr_b32 s59, s72, 8 +; VI-NEXT: v_writelane_b32 v22, s59, 52 +; VI-NEXT: s_lshr_b32 s59, s8, 16 +; VI-NEXT: v_writelane_b32 v22, s59, 53 +; VI-NEXT: s_lshr_b32 s59, s8, 8 +; VI-NEXT: v_writelane_b32 v22, s59, 54 +; VI-NEXT: s_lshr_b32 s59, s62, 24 +; VI-NEXT: v_writelane_b32 v22, s59, 55 +; VI-NEXT: s_lshr_b32 s59, s62, 16 +; VI-NEXT: v_writelane_b32 v22, s59, 56 +; VI-NEXT: s_lshr_b32 s59, s62, 8 +; VI-NEXT: v_writelane_b32 v22, s59, 57 +; VI-NEXT: s_lshr_b32 s59, s10, 16 +; VI-NEXT: v_writelane_b32 v22, s59, 58 +; VI-NEXT: s_lshr_b32 s59, s10, 8 +; VI-NEXT: v_writelane_b32 v22, s59, 59 +; VI-NEXT: s_lshr_b32 s59, s60, 24 +; VI-NEXT: v_writelane_b32 v22, s59, 60 +; VI-NEXT: s_lshr_b32 s59, s60, 16 +; VI-NEXT: v_writelane_b32 v22, s59, 61 +; VI-NEXT: s_lshr_b32 s59, s60, 8 +; VI-NEXT: v_writelane_b32 v22, s59, 62 +; VI-NEXT: s_lshr_b32 s59, s12, 16 +; VI-NEXT: s_mov_b32 s5, s76 +; VI-NEXT: v_writelane_b32 v22, s59, 63 +; VI-NEXT: s_lshr_b32 s59, s12, 8 +; VI-NEXT: v_writelane_b32 v21, s59, 0 +; VI-NEXT: s_lshr_b32 s59, s58, 24 +; VI-NEXT: s_lshr_b64 vcc, s[4:5], 24 +; VI-NEXT: s_mov_b32 s7, s74 +; VI-NEXT: v_writelane_b32 v21, s59, 1 +; VI-NEXT: s_lshr_b32 s59, s58, 16 +; VI-NEXT: v_writelane_b32 v22, vcc_lo, 8 +; VI-NEXT: v_writelane_b32 v21, s59, 2 +; VI-NEXT: s_lshr_b32 s59, s58, 8 +; VI-NEXT: v_writelane_b32 v22, vcc_hi, 9 +; VI-NEXT: s_lshr_b64 vcc, s[6:7], 24 +; VI-NEXT: s_mov_b32 s9, s72 +; VI-NEXT: v_writelane_b32 v21, s59, 3 +; VI-NEXT: s_lshr_b32 s59, s14, 16 +; VI-NEXT: v_writelane_b32 v22, vcc_lo, 6 +; VI-NEXT: v_writelane_b32 v21, s59, 4 +; VI-NEXT: s_lshr_b32 s59, s14, 8 +; VI-NEXT: v_writelane_b32 v22, vcc_hi, 7 +; VI-NEXT: s_lshr_b64 vcc, s[8:9], 24 +; VI-NEXT: s_mov_b32 s11, s62 +; VI-NEXT: v_writelane_b32 v21, s59, 5 +; VI-NEXT: s_lshr_b32 s59, s56, 24 +; VI-NEXT: v_writelane_b32 v22, vcc_lo, 4 +; VI-NEXT: v_writelane_b32 v21, s59, 6 +; VI-NEXT: s_lshr_b32 s59, s56, 16 +; VI-NEXT: v_writelane_b32 v22, vcc_hi, 5 +; VI-NEXT: s_lshr_b64 vcc, s[10:11], 24 +; VI-NEXT: s_mov_b32 s13, s60 +; VI-NEXT: v_writelane_b32 v21, s59, 7 +; VI-NEXT: s_lshr_b32 s59, s56, 8 +; VI-NEXT: v_writelane_b32 v22, vcc_lo, 2 +; VI-NEXT: s_mov_b32 s41, s46 +; VI-NEXT: s_mov_b32 s17, s56 +; VI-NEXT: s_mov_b32 s15, s58 +; VI-NEXT: s_mov_b32 s45, s78 +; VI-NEXT: s_mov_b32 s43, s88 +; VI-NEXT: s_mov_b32 s29, s90 +; VI-NEXT: s_mov_b32 s27, s30 +; VI-NEXT: s_mov_b32 s25, s36 +; VI-NEXT: s_mov_b32 s23, s48 +; VI-NEXT: s_mov_b32 s21, s52 +; VI-NEXT: s_mov_b32 s19, s64 +; VI-NEXT: v_writelane_b32 v21, s59, 8 +; VI-NEXT: s_lshr_b32 s59, s16, 16 +; VI-NEXT: v_writelane_b32 v22, vcc_hi, 3 +; VI-NEXT: s_lshr_b64 vcc, s[12:13], 24 +; VI-NEXT: s_lshr_b32 s47, s88, 24 +; VI-NEXT: s_lshr_b32 s57, s88, 16 +; VI-NEXT: s_lshr_b32 s61, s88, 8 +; VI-NEXT: s_lshr_b32 s75, s42, 16 +; VI-NEXT: s_lshr_b32 s79, s42, 8 +; VI-NEXT: s_lshr_b32 s89, s78, 24 +; VI-NEXT: s_lshr_b32 s91, s78, 16 +; VI-NEXT: s_lshr_b32 s31, s78, 8 +; VI-NEXT: s_lshr_b32 s37, s44, 16 +; VI-NEXT: s_lshr_b32 s49, s44, 8 +; VI-NEXT: v_writelane_b32 v21, s59, 9 +; VI-NEXT: s_lshr_b32 s59, s16, 8 +; VI-NEXT: s_lshr_b32 s63, s46, 24 +; VI-NEXT: s_lshr_b32 s73, s46, 16 +; VI-NEXT: s_lshr_b32 s77, s46, 8 +; VI-NEXT: s_lshr_b32 s53, s40, 16 +; VI-NEXT: s_lshr_b32 s65, s40, 8 +; VI-NEXT: s_lshr_b64 s[80:81], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[82:83], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[86:87], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[34:35], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[38:39], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[50:51], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[54:55], s[42:43], 24 +; VI-NEXT: s_lshr_b64 s[66:67], s[44:45], 24 +; VI-NEXT: v_writelane_b32 v22, vcc_lo, 0 +; VI-NEXT: s_lshr_b64 s[68:69], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[70:71], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[84:85], s[40:41], 24 +; VI-NEXT: v_writelane_b32 v22, vcc_hi, 1 +; VI-NEXT: .LBB91_3: ; %end +; VI-NEXT: s_and_b32 s5, s44, 0xff +; VI-NEXT: s_lshl_b32 s7, s49, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_lshl_b32 s7, s66, 8 +; VI-NEXT: s_and_b32 s9, s37, 0xff +; VI-NEXT: s_or_b32 s7, s9, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_and_b32 s5, s78, 0xff +; VI-NEXT: s_lshl_b32 s7, s31, 8 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: s_and_b32 s7, s9, 0xff -; VI-NEXT: s_lshl_b32 s9, s52, 8 +; VI-NEXT: s_and_b32 s7, s91, 0xff +; VI-NEXT: s_lshl_b32 s9, s89, 8 ; VI-NEXT: s_or_b32 s7, s7, s9 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: v_mov_b32_e32 v28, s5 -; VI-NEXT: s_and_b32 s5, s6, 0xff -; VI-NEXT: s_lshl_b32 s6, s35, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, vcc_hi, 0xff -; VI-NEXT: s_lshl_b32 s7, vcc_lo, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s42, 0xff +; VI-NEXT: s_lshl_b32 s7, s79, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_lshl_b32 s7, s54, 8 +; VI-NEXT: s_and_b32 s9, s75, 0xff +; VI-NEXT: s_or_b32 s7, s9, s7 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: v_mov_b32_e32 v29, s5 -; VI-NEXT: s_and_b32 s5, s18, 0xff -; VI-NEXT: s_lshl_b32 s6, s34, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s31, 0xff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: s_and_b32 s5, s88, 0xff +; VI-NEXT: s_lshl_b32 s7, s61, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s57, 0xff +; VI-NEXT: s_lshl_b32 s9, s47, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_readlane_b32 s7, v22, 39 +; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: s_and_b32 s5, s28, 0xff +; VI-NEXT: s_lshl_b32 s7, s7, 8 +; VI-NEXT: v_readlane_b32 s9, v22, 38 +; VI-NEXT: s_or_b32 s5, s5, s7 ; VI-NEXT: s_lshl_b32 s7, s50, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s9, s9, 0xff +; VI-NEXT: s_or_b32 s7, s9, s7 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen -; VI-NEXT: v_add_u32_e32 v28, vcc, 4, v0 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v29, s5 -; VI-NEXT: s_and_b32 s5, s8, 0xff -; VI-NEXT: s_lshl_b32 s6, s91, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s90, 0xff -; VI-NEXT: s_lshl_b32 s7, s89, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_readlane_b32 s7, v22, 37 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: s_and_b32 s5, s90, 0xff +; VI-NEXT: s_lshl_b32 s7, s7, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_readlane_b32 s7, v22, 36 +; VI-NEXT: v_readlane_b32 s9, v22, 35 +; VI-NEXT: s_and_b32 s7, s7, 0xff +; VI-NEXT: s_lshl_b32 s9, s9, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_add_u32_e32 v28, vcc, 8, v0 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v29, s5 -; VI-NEXT: s_and_b32 s5, s20, 0xff -; VI-NEXT: s_lshl_b32 s6, s88, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s79, 0xff -; VI-NEXT: s_lshl_b32 s7, s48, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_readlane_b32 s7, v22, 34 +; VI-NEXT: v_mov_b32_e32 v6, s5 +; VI-NEXT: s_and_b32 s5, s26, 0xff +; VI-NEXT: s_lshl_b32 s7, s7, 8 +; VI-NEXT: v_readlane_b32 s9, v22, 33 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_lshl_b32 s7, s38, 8 +; VI-NEXT: s_and_b32 s9, s9, 0xff +; VI-NEXT: s_or_b32 s7, s9, s7 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_add_u32_e32 v28, vcc, 12, v0 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v29, s5 -; VI-NEXT: s_and_b32 s5, s10, 0xff -; VI-NEXT: s_lshl_b32 s6, s77, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s76, 0xff -; VI-NEXT: s_lshl_b32 s7, s75, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_readlane_b32 s7, v22, 32 +; VI-NEXT: v_mov_b32_e32 v7, s5 +; VI-NEXT: s_and_b32 s5, s30, 0xff +; VI-NEXT: s_lshl_b32 s7, s7, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_readlane_b32 s7, v22, 31 +; VI-NEXT: v_readlane_b32 s9, v22, 30 +; VI-NEXT: s_and_b32 s7, s7, 0xff +; VI-NEXT: s_lshl_b32 s9, s9, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_add_u32_e32 v28, vcc, 16, v0 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_readlane_b32 s7, v22, 29 +; VI-NEXT: v_mov_b32_e32 v8, s5 +; VI-NEXT: s_and_b32 s5, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s7, 8 +; VI-NEXT: v_readlane_b32 s9, v22, 28 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_lshl_b32 s7, s34, 8 +; VI-NEXT: s_and_b32 s9, s9, 0xff +; VI-NEXT: s_or_b32 s7, s9, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_readlane_b32 s7, v22, 27 +; VI-NEXT: v_mov_b32_e32 v9, s5 +; VI-NEXT: s_and_b32 s5, s36, 0xff +; VI-NEXT: s_lshl_b32 s7, s7, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_readlane_b32 s7, v22, 26 +; VI-NEXT: v_readlane_b32 s9, v22, 25 +; VI-NEXT: s_and_b32 s7, s7, 0xff +; VI-NEXT: s_lshl_b32 s9, s9, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_readlane_b32 s7, v22, 24 +; VI-NEXT: v_mov_b32_e32 v10, s5 ; VI-NEXT: s_and_b32 s5, s22, 0xff -; VI-NEXT: s_lshl_b32 s6, s73, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s72, 0xff -; VI-NEXT: s_lshl_b32 s7, s38, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_lshl_b32 s7, s7, 8 +; VI-NEXT: v_readlane_b32 s9, v22, 23 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_lshl_b32 s7, s86, 8 +; VI-NEXT: s_and_b32 s9, s9, 0xff +; VI-NEXT: s_or_b32 s7, s9, s7 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_add_u32_e32 v28, vcc, 20, v0 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v29, s5 -; VI-NEXT: s_and_b32 s5, s12, 0xff -; VI-NEXT: s_lshl_b32 s6, s63, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s61, 0xff -; VI-NEXT: s_lshl_b32 s7, s60, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_readlane_b32 s7, v22, 22 +; VI-NEXT: v_mov_b32_e32 v11, s5 +; VI-NEXT: s_and_b32 s5, s48, 0xff +; VI-NEXT: s_lshl_b32 s7, s7, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_readlane_b32 s7, v22, 21 +; VI-NEXT: v_readlane_b32 s9, v22, 20 +; VI-NEXT: s_and_b32 s7, s7, 0xff +; VI-NEXT: s_lshl_b32 s9, s9, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_add_u32_e32 v28, vcc, 24, v0 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v29, s5 -; VI-NEXT: s_and_b32 s5, s24, 0xff -; VI-NEXT: s_lshl_b32 s6, s59, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s57, 0xff -; VI-NEXT: s_lshl_b32 s7, s36, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_readlane_b32 s7, v22, 19 +; VI-NEXT: v_mov_b32_e32 v12, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s7, s7, 8 +; VI-NEXT: v_readlane_b32 s9, v22, 18 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_lshl_b32 s7, s82, 8 +; VI-NEXT: s_and_b32 s9, s9, 0xff +; VI-NEXT: s_or_b32 s7, s9, s7 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_add_u32_e32 v28, vcc, 28, v0 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_readlane_b32 s7, v22, 17 +; VI-NEXT: v_mov_b32_e32 v13, s5 +; VI-NEXT: s_and_b32 s5, s52, 0xff +; VI-NEXT: s_lshl_b32 s7, s7, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_readlane_b32 s7, v22, 16 +; VI-NEXT: v_readlane_b32 s9, v22, 15 +; VI-NEXT: s_and_b32 s7, s7, 0xff +; VI-NEXT: s_lshl_b32 s9, s9, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_readlane_b32 s7, v22, 14 +; VI-NEXT: v_mov_b32_e32 v14, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s7, s7, 8 +; VI-NEXT: v_readlane_b32 s9, v22, 13 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_lshl_b32 s7, s80, 8 +; VI-NEXT: s_and_b32 s9, s9, 0xff +; VI-NEXT: s_or_b32 s7, s9, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_readlane_b32 s7, v22, 12 +; VI-NEXT: v_mov_b32_e32 v15, s5 +; VI-NEXT: s_and_b32 s5, s64, 0xff +; VI-NEXT: s_lshl_b32 s7, s7, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_readlane_b32 s7, v22, 11 +; VI-NEXT: v_readlane_b32 s9, v22, 10 +; VI-NEXT: s_and_b32 s7, s7, 0xff +; VI-NEXT: s_lshl_b32 s9, s9, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_mov_b32_e32 v16, s5 ; VI-NEXT: s_and_b32 s5, s40, 0xff -; VI-NEXT: s_lshl_b32 s6, s56, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s47, 0xff -; VI-NEXT: s_lshl_b32 s7, s46, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_lshl_b32 s7, s65, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_lshl_b32 s7, s84, 8 +; VI-NEXT: s_and_b32 s9, s53, 0xff +; VI-NEXT: s_or_b32 s7, s9, s7 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_add_u32_e32 v28, vcc, 32, v0 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v29, s5 -; VI-NEXT: s_and_b32 s5, s26, 0xff -; VI-NEXT: s_lshl_b32 s6, s45, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s43, 0xff -; VI-NEXT: s_lshl_b32 s7, s30, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; VI-NEXT: v_mov_b32_e32 v17, s5 +; VI-NEXT: s_and_b32 s5, s46, 0xff +; VI-NEXT: s_lshl_b32 s7, s77, 8 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s73, 0xff +; VI-NEXT: s_lshl_b32 s9, s63, 8 +; VI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_add_u32_e32 v28, vcc, 36, v0 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v29, s5 -; VI-NEXT: s_and_b32 s5, s44, 0xff -; VI-NEXT: s_lshl_b32 s6, s42, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s41, 0xff -; VI-NEXT: s_lshl_b32 s7, s29, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0 +; VI-NEXT: v_mov_b32_e32 v18, s5 +; VI-NEXT: s_and_b32 s5, s16, 0xff +; VI-NEXT: s_lshl_b32 s7, s59, 8 +; VI-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0 +; VI-NEXT: v_readlane_b32 s7, v21, 9 +; VI-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 36, v0 +; VI-NEXT: s_and_b32 s7, s7, 0xff +; VI-NEXT: s_lshl_b32 s9, s70, 8 +; VI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 40, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 44, v0 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_add_u32_e32 v28, vcc, 40, v0 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v29, s5 -; VI-NEXT: s_and_b32 s5, s28, 0xff -; VI-NEXT: s_lshl_b32 s6, s27, 8 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_readlane_b32 s7, v21, 8 +; VI-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 52, v0 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s56, 0xff +; VI-NEXT: s_lshl_b32 s7, s7, 8 +; VI-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 56, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_readlane_b32 s7, v21, 7 +; VI-NEXT: v_readlane_b32 s9, v21, 6 +; VI-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 60, v0 +; VI-NEXT: s_and_b32 s7, s7, 0xff +; VI-NEXT: s_lshl_b32 s9, s9, 8 +; VI-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 64, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x44, v0 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x48, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_readlane_b32 s7, v21, 5 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s14, 0xff +; VI-NEXT: s_lshl_b32 s7, s7, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_readlane_b32 s7, v21, 4 +; VI-NEXT: s_and_b32 s7, s7, 0xff +; VI-NEXT: s_lshl_b32 s9, s68, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x4c, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_readlane_b32 s7, v21, 3 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s58, 0xff +; VI-NEXT: s_lshl_b32 s7, s7, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_readlane_b32 s7, v21, 2 +; VI-NEXT: v_readlane_b32 s9, v21, 1 +; VI-NEXT: s_and_b32 s7, s7, 0xff +; VI-NEXT: s_lshl_b32 s9, s9, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x50, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_readlane_b32 s7, v21, 0 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s12, 0xff +; VI-NEXT: s_lshl_b32 s7, s7, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_readlane_b32 s7, v22, 63 +; VI-NEXT: v_readlane_b32 s12, v22, 0 +; VI-NEXT: s_and_b32 s7, s7, 0xff +; VI-NEXT: s_lshl_b32 s9, s12, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x54, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_readlane_b32 s7, v22, 62 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s60, 0xff +; VI-NEXT: s_lshl_b32 s7, s7, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_readlane_b32 s7, v22, 61 +; VI-NEXT: v_readlane_b32 s9, v22, 60 +; VI-NEXT: s_and_b32 s7, s7, 0xff +; VI-NEXT: s_lshl_b32 s9, s9, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x58, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_readlane_b32 s7, v22, 59 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s10, 0xff +; VI-NEXT: s_lshl_b32 s7, s7, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_readlane_b32 s7, v22, 58 +; VI-NEXT: v_readlane_b32 s10, v22, 2 +; VI-NEXT: s_and_b32 s7, s7, 0xff +; VI-NEXT: s_lshl_b32 s9, s10, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x5c, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_readlane_b32 s7, v22, 57 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s62, 0xff +; VI-NEXT: s_lshl_b32 s7, s7, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_readlane_b32 s7, v22, 56 +; VI-NEXT: v_readlane_b32 s9, v22, 55 +; VI-NEXT: s_and_b32 s7, s7, 0xff +; VI-NEXT: s_lshl_b32 s9, s9, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x60, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_readlane_b32 s7, v22, 54 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s8, 0xff +; VI-NEXT: s_lshl_b32 s7, s7, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_readlane_b32 s7, v22, 53 +; VI-NEXT: v_readlane_b32 s8, v22, 4 +; VI-NEXT: s_and_b32 s7, s7, 0xff +; VI-NEXT: s_lshl_b32 s8, s8, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x64, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_readlane_b32 s7, v22, 52 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s72, 0xff +; VI-NEXT: s_lshl_b32 s7, s7, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_readlane_b32 s7, v22, 51 +; VI-NEXT: v_readlane_b32 s8, v22, 50 +; VI-NEXT: s_and_b32 s7, s7, 0xff +; VI-NEXT: s_lshl_b32 s8, s8, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x68, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s6, 0xff +; VI-NEXT: v_readlane_b32 s6, v22, 49 +; VI-NEXT: v_readlane_b32 s9, v22, 5 +; VI-NEXT: s_lshl_b32 s6, s6, 8 ; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s25, 0xff -; VI-NEXT: s_lshl_b32 s7, s78, 8 +; VI-NEXT: v_readlane_b32 s6, v22, 48 +; VI-NEXT: v_readlane_b32 s8, v22, 6 +; VI-NEXT: s_and_b32 s6, s6, 0xff +; VI-NEXT: s_lshl_b32 s7, s8, 8 ; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_add_u32_e32 v28, vcc, 44, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x6c, v0 ; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v29, s5 -; VI-NEXT: s_and_b32 s5, s58, 0xff -; VI-NEXT: s_lshl_b32 s6, s23, 8 +; VI-NEXT: v_readlane_b32 s6, v22, 47 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s74, 0xff +; VI-NEXT: s_lshl_b32 s6, s6, 8 ; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s21, 0xff -; VI-NEXT: s_lshl_b32 s7, s19, 8 +; VI-NEXT: v_readlane_b32 s6, v22, 46 +; VI-NEXT: v_readlane_b32 s7, v22, 45 +; VI-NEXT: s_and_b32 s6, s6, 0xff +; VI-NEXT: s_lshl_b32 s7, s7, 8 ; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_add_u32_e32 v28, vcc, 48, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x70, v0 ; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_readlane_b32 s5, v22, 44 ; VI-NEXT: s_and_b32 s4, s4, 0xff -; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_lshl_b32 s5, s5, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, s15, 0xff -; VI-NEXT: s_lshl_b32 s6, s74, 8 +; VI-NEXT: v_readlane_b32 s5, v22, 43 +; VI-NEXT: v_readlane_b32 s6, v22, 8 +; VI-NEXT: s_and_b32 s5, s5, 0xff +; VI-NEXT: s_lshl_b32 s6, s6, 8 ; VI-NEXT: s_or_b32 s5, s5, s6 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: v_add_u32_e32 v28, vcc, 52, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x74, v0 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v29, s4 -; VI-NEXT: s_and_b32 s4, s62, 0xff -; VI-NEXT: s_lshl_b32 s5, s14, 8 +; VI-NEXT: v_readlane_b32 s5, v22, 42 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_and_b32 s4, s76, 0xff +; VI-NEXT: s_lshl_b32 s5, s5, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, s13, 0xff -; VI-NEXT: s_lshl_b32 s6, s11, 8 +; VI-NEXT: v_readlane_b32 s5, v22, 41 +; VI-NEXT: v_readlane_b32 s6, v22, 40 +; VI-NEXT: s_and_b32 s5, s5, 0xff +; VI-NEXT: s_lshl_b32 s6, s6, 8 ; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: v_or_b32_sdwa v2, v2, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v31 -; VI-NEXT: v_add_u32_e32 v28, vcc, 56, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x78, v0 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_or_b32_sdwa v25, v25, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen -; VI-NEXT: v_add_u32_e32 v28, vcc, 60, v0 -; VI-NEXT: v_mov_b32_e32 v29, s4 -; VI-NEXT: v_or_b32_sdwa v2, v2, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v25, vcc, 64, v0 -; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen -; VI-NEXT: buffer_store_dword v2, v25, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v60 -; VI-NEXT: v_or_b32_sdwa v2, v24, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v59 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v30 -; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v58 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v47 -; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v46 -; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_readlane_b32 s87, v63, 31 -; VI-NEXT: v_readlane_b32 s86, v63, 30 -; VI-NEXT: v_readlane_b32 s85, v63, 29 -; VI-NEXT: v_readlane_b32 s84, v63, 28 -; VI-NEXT: v_readlane_b32 s83, v63, 27 -; VI-NEXT: v_readlane_b32 s82, v63, 26 -; VI-NEXT: v_readlane_b32 s81, v63, 25 -; VI-NEXT: v_readlane_b32 s80, v63, 24 -; VI-NEXT: v_readlane_b32 s71, v63, 23 -; VI-NEXT: v_readlane_b32 s70, v63, 22 -; VI-NEXT: v_readlane_b32 s69, v63, 21 -; VI-NEXT: v_readlane_b32 s68, v63, 20 -; VI-NEXT: v_readlane_b32 s67, v63, 19 -; VI-NEXT: v_readlane_b32 s66, v63, 18 -; VI-NEXT: v_readlane_b32 s65, v63, 17 -; VI-NEXT: v_readlane_b32 s64, v63, 16 -; VI-NEXT: v_readlane_b32 s55, v63, 15 -; VI-NEXT: v_readlane_b32 s54, v63, 14 -; VI-NEXT: v_readlane_b32 s53, v63, 13 -; VI-NEXT: v_readlane_b32 s52, v63, 12 -; VI-NEXT: v_readlane_b32 s51, v63, 11 -; VI-NEXT: v_readlane_b32 s50, v63, 10 -; VI-NEXT: v_readlane_b32 s49, v63, 9 -; VI-NEXT: v_readlane_b32 s48, v63, 8 -; VI-NEXT: v_readlane_b32 s39, v63, 7 -; VI-NEXT: v_readlane_b32 s38, v63, 6 -; VI-NEXT: v_readlane_b32 s37, v63, 5 -; VI-NEXT: v_readlane_b32 s36, v63, 4 -; VI-NEXT: v_readlane_b32 s35, v63, 3 -; VI-NEXT: v_readlane_b32 s34, v63, 2 -; VI-NEXT: v_readlane_b32 s31, v63, 1 -; VI-NEXT: v_readlane_b32 s30, v63, 0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 -; VI-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v44 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v42 -; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v43, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v41 -; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 -; VI-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v55 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v53 -; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v52 -; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 -; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v50 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v48 -; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v39 -; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 -; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v37 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v35 -; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v34 -; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 -; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v32 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v18 -; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v21, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v15 -; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 -; VI-NEXT: v_or_b32_sdwa v2, v12, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 -; VI-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_readlane_b32 s13, v22, 1 +; VI-NEXT: v_readlane_b32 s11, v22, 3 +; VI-NEXT: v_readlane_b32 s9, v22, 7 +; VI-NEXT: v_readlane_b32 s7, v22, 9 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: v_readlane_b32 s87, v20, 31 +; VI-NEXT: v_readlane_b32 s86, v20, 30 +; VI-NEXT: v_readlane_b32 s85, v20, 29 +; VI-NEXT: v_readlane_b32 s84, v20, 28 +; VI-NEXT: v_readlane_b32 s83, v20, 27 +; VI-NEXT: v_readlane_b32 s82, v20, 26 +; VI-NEXT: v_readlane_b32 s81, v20, 25 +; VI-NEXT: v_readlane_b32 s80, v20, 24 +; VI-NEXT: v_readlane_b32 s71, v20, 23 +; VI-NEXT: v_readlane_b32 s70, v20, 22 +; VI-NEXT: v_readlane_b32 s69, v20, 21 +; VI-NEXT: v_readlane_b32 s68, v20, 20 +; VI-NEXT: v_readlane_b32 s67, v20, 19 +; VI-NEXT: v_readlane_b32 s66, v20, 18 +; VI-NEXT: v_readlane_b32 s65, v20, 17 +; VI-NEXT: v_readlane_b32 s64, v20, 16 +; VI-NEXT: v_readlane_b32 s55, v20, 15 +; VI-NEXT: v_readlane_b32 s54, v20, 14 +; VI-NEXT: v_readlane_b32 s53, v20, 13 +; VI-NEXT: v_readlane_b32 s52, v20, 12 +; VI-NEXT: v_readlane_b32 s51, v20, 11 +; VI-NEXT: v_readlane_b32 s50, v20, 10 +; VI-NEXT: v_readlane_b32 s49, v20, 9 +; VI-NEXT: v_readlane_b32 s48, v20, 8 +; VI-NEXT: v_readlane_b32 s39, v20, 7 +; VI-NEXT: v_readlane_b32 s38, v20, 6 +; VI-NEXT: v_readlane_b32 s37, v20, 5 +; VI-NEXT: v_readlane_b32 s36, v20, 4 +; VI-NEXT: v_readlane_b32 s35, v20, 3 +; VI-NEXT: v_readlane_b32 s34, v20, 2 +; VI-NEXT: v_readlane_b32 s31, v20, 1 +; VI-NEXT: v_readlane_b32 s30, v20, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB91_4: +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: v_writelane_b32 v22, s60, 0 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: v_writelane_b32 v22, s61, 1 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: v_writelane_b32 v22, s62, 2 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: v_writelane_b32 v22, s63, 3 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: v_writelane_b32 v22, s72, 4 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: v_writelane_b32 v22, s73, 5 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: v_writelane_b32 v22, s74, 6 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: v_writelane_b32 v22, s75, 7 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: v_writelane_b32 v22, s76, 8 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr49 +; VI-NEXT: ; implicit-def: $sgpr37 +; VI-NEXT: ; implicit-def: $sgpr31 +; VI-NEXT: ; implicit-def: $sgpr91 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr65 +; VI-NEXT: ; implicit-def: $sgpr53 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr66 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr54 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr50 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr36 +; VI-NEXT: ; implicit-def: $sgpr86 +; VI-NEXT: ; implicit-def: $sgpr48 +; VI-NEXT: ; implicit-def: $sgpr82 +; VI-NEXT: ; implicit-def: $sgpr52 +; VI-NEXT: ; implicit-def: $sgpr80 +; VI-NEXT: ; implicit-def: $sgpr64 +; VI-NEXT: ; implicit-def: $sgpr84 +; VI-NEXT: ; implicit-def: $sgpr70 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr68 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: v_writelane_b32 v22, s77, 9 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: s_branch .LBB91_2 ; ; GFX9-LABEL: bitcast_v64bf16_to_v128i8_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v63, s30, 0 -; GFX9-NEXT: v_writelane_b32 v63, s31, 1 -; GFX9-NEXT: v_writelane_b32 v63, s34, 2 -; GFX9-NEXT: v_writelane_b32 v63, s35, 3 -; GFX9-NEXT: v_writelane_b32 v63, s36, 4 -; GFX9-NEXT: v_writelane_b32 v63, s37, 5 -; GFX9-NEXT: v_writelane_b32 v63, s38, 6 -; GFX9-NEXT: v_writelane_b32 v63, s39, 7 -; GFX9-NEXT: v_writelane_b32 v63, s48, 8 -; GFX9-NEXT: v_writelane_b32 v63, s49, 9 -; GFX9-NEXT: v_writelane_b32 v63, s50, 10 -; GFX9-NEXT: v_writelane_b32 v63, s51, 11 -; GFX9-NEXT: v_writelane_b32 v63, s52, 12 -; GFX9-NEXT: v_writelane_b32 v63, s53, 13 -; GFX9-NEXT: v_writelane_b32 v63, s54, 14 -; GFX9-NEXT: v_writelane_b32 v63, s55, 15 -; GFX9-NEXT: v_writelane_b32 v63, s64, 16 -; GFX9-NEXT: v_writelane_b32 v63, s65, 17 -; GFX9-NEXT: v_writelane_b32 v63, s66, 18 -; GFX9-NEXT: v_writelane_b32 v63, s67, 19 -; GFX9-NEXT: v_writelane_b32 v63, s68, 20 -; GFX9-NEXT: v_writelane_b32 v63, s69, 21 -; GFX9-NEXT: v_writelane_b32 v63, s70, 22 -; GFX9-NEXT: v_writelane_b32 v63, s71, 23 -; GFX9-NEXT: v_writelane_b32 v63, s80, 24 -; GFX9-NEXT: v_writelane_b32 v63, s81, 25 -; GFX9-NEXT: v_writelane_b32 v63, s82, 26 -; GFX9-NEXT: v_writelane_b32 v63, s83, 27 -; GFX9-NEXT: v_writelane_b32 v63, s84, 28 -; GFX9-NEXT: v_writelane_b32 v63, s85, 29 -; GFX9-NEXT: v_writelane_b32 v63, s86, 30 -; GFX9-NEXT: v_writelane_b32 v63, s87, 31 -; GFX9-NEXT: v_writelane_b32 v63, s96, 32 -; GFX9-NEXT: v_writelane_b32 v63, s97, 33 -; GFX9-NEXT: v_writelane_b32 v63, s98, 34 +; GFX9-NEXT: v_writelane_b32 v20, s30, 0 +; GFX9-NEXT: v_writelane_b32 v20, s31, 1 +; GFX9-NEXT: v_writelane_b32 v20, s34, 2 +; GFX9-NEXT: v_writelane_b32 v20, s35, 3 +; GFX9-NEXT: v_writelane_b32 v20, s36, 4 +; GFX9-NEXT: v_writelane_b32 v20, s37, 5 +; GFX9-NEXT: v_writelane_b32 v20, s38, 6 +; GFX9-NEXT: v_writelane_b32 v20, s39, 7 +; GFX9-NEXT: v_writelane_b32 v20, s48, 8 +; GFX9-NEXT: v_writelane_b32 v20, s49, 9 +; GFX9-NEXT: v_writelane_b32 v20, s50, 10 +; GFX9-NEXT: v_writelane_b32 v20, s51, 11 +; GFX9-NEXT: v_writelane_b32 v20, s52, 12 +; GFX9-NEXT: v_writelane_b32 v20, s53, 13 +; GFX9-NEXT: v_writelane_b32 v20, s54, 14 +; GFX9-NEXT: v_writelane_b32 v20, s55, 15 +; GFX9-NEXT: v_writelane_b32 v20, s64, 16 +; GFX9-NEXT: v_writelane_b32 v20, s65, 17 +; GFX9-NEXT: v_writelane_b32 v20, s66, 18 +; GFX9-NEXT: v_writelane_b32 v20, s67, 19 +; GFX9-NEXT: v_writelane_b32 v20, s68, 20 +; GFX9-NEXT: v_writelane_b32 v20, s69, 21 +; GFX9-NEXT: v_writelane_b32 v20, s70, 22 +; GFX9-NEXT: v_writelane_b32 v20, s71, 23 +; GFX9-NEXT: v_writelane_b32 v20, s80, 24 +; GFX9-NEXT: v_writelane_b32 v20, s81, 25 +; GFX9-NEXT: v_writelane_b32 v20, s82, 26 +; GFX9-NEXT: v_writelane_b32 v20, s83, 27 +; GFX9-NEXT: v_writelane_b32 v20, s84, 28 +; GFX9-NEXT: v_writelane_b32 v20, s85, 29 +; GFX9-NEXT: v_writelane_b32 v20, s86, 30 +; GFX9-NEXT: v_writelane_b32 v20, s87, 31 +; GFX9-NEXT: v_writelane_b32 v20, s96, 32 +; GFX9-NEXT: v_writelane_b32 v20, s97, 33 +; GFX9-NEXT: v_readfirstlane_b32 s40, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, s16 +; GFX9-NEXT: v_readfirstlane_b32 s41, v4 +; GFX9-NEXT: v_mov_b32_e32 v4, s17 +; GFX9-NEXT: v_readfirstlane_b32 s16, v5 +; GFX9-NEXT: v_mov_b32_e32 v5, s18 +; GFX9-NEXT: v_readfirstlane_b32 s17, v6 +; GFX9-NEXT: v_mov_b32_e32 v6, s19 +; GFX9-NEXT: v_readfirstlane_b32 s14, v7 +; GFX9-NEXT: v_mov_b32_e32 v7, s20 +; GFX9-NEXT: v_readfirstlane_b32 s15, v8 +; GFX9-NEXT: v_mov_b32_e32 v8, s21 +; GFX9-NEXT: v_readfirstlane_b32 s12, v9 +; GFX9-NEXT: v_mov_b32_e32 v9, s22 +; GFX9-NEXT: v_readfirstlane_b32 s13, v10 +; GFX9-NEXT: v_mov_b32_e32 v10, s23 +; GFX9-NEXT: v_readfirstlane_b32 s10, v11 +; GFX9-NEXT: v_mov_b32_e32 v11, s24 +; GFX9-NEXT: v_readfirstlane_b32 s11, v12 +; GFX9-NEXT: v_mov_b32_e32 v12, s25 +; GFX9-NEXT: v_readfirstlane_b32 s8, v13 +; GFX9-NEXT: v_mov_b32_e32 v13, s26 +; GFX9-NEXT: v_readfirstlane_b32 s9, v14 +; GFX9-NEXT: v_mov_b32_e32 v14, s27 +; GFX9-NEXT: v_readfirstlane_b32 s6, v15 +; GFX9-NEXT: v_mov_b32_e32 v15, s28 +; GFX9-NEXT: v_readfirstlane_b32 s7, v16 +; GFX9-NEXT: v_mov_b32_e32 v16, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; GFX9-NEXT: v_writelane_b32 v63, s99, 35 -; GFX9-NEXT: v_readfirstlane_b32 s76, v3 -; GFX9-NEXT: v_readfirstlane_b32 s77, v4 -; GFX9-NEXT: v_readfirstlane_b32 s74, v5 -; GFX9-NEXT: v_readfirstlane_b32 s75, v6 -; GFX9-NEXT: v_readfirstlane_b32 s72, v7 -; GFX9-NEXT: v_readfirstlane_b32 s73, v8 -; GFX9-NEXT: v_readfirstlane_b32 s62, v9 -; GFX9-NEXT: v_readfirstlane_b32 s63, v10 -; GFX9-NEXT: v_readfirstlane_b32 s60, v11 -; GFX9-NEXT: v_readfirstlane_b32 s61, v12 -; GFX9-NEXT: v_readfirstlane_b32 s58, v13 -; GFX9-NEXT: v_readfirstlane_b32 s59, v14 -; GFX9-NEXT: v_readfirstlane_b32 s56, v15 -; GFX9-NEXT: v_readfirstlane_b32 s57, v16 -; GFX9-NEXT: v_readfirstlane_b32 s46, v17 -; GFX9-NEXT: v_readfirstlane_b32 s47, v18 -; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: v_readfirstlane_b32 s5, v2 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane -; GFX9-NEXT: s_cbranch_scc0 .LBB91_3 +; GFX9-NEXT: v_writelane_b32 v20, s98, 34 +; GFX9-NEXT: v_readfirstlane_b32 s4, v17 +; GFX9-NEXT: v_readfirstlane_b32 s5, v18 +; GFX9-NEXT: v_readfirstlane_b32 s44, v3 +; GFX9-NEXT: v_readfirstlane_b32 s45, v4 +; GFX9-NEXT: v_readfirstlane_b32 s42, v5 +; GFX9-NEXT: v_readfirstlane_b32 s43, v6 +; GFX9-NEXT: v_readfirstlane_b32 s28, v7 +; GFX9-NEXT: v_readfirstlane_b32 s29, v8 +; GFX9-NEXT: v_readfirstlane_b32 s26, v9 +; GFX9-NEXT: v_readfirstlane_b32 s27, v10 +; GFX9-NEXT: v_readfirstlane_b32 s24, v11 +; GFX9-NEXT: v_readfirstlane_b32 s25, v12 +; GFX9-NEXT: v_readfirstlane_b32 s22, v13 +; GFX9-NEXT: v_readfirstlane_b32 s23, v14 +; GFX9-NEXT: v_readfirstlane_b32 s20, v15 +; GFX9-NEXT: v_readfirstlane_b32 s21, v16 +; GFX9-NEXT: v_readfirstlane_b32 s18, v1 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s19, v2 +; GFX9-NEXT: v_writelane_b32 v20, s99, 35 +; GFX9-NEXT: ; implicit-def: $vgpr21 : SGPR spill to VGPR lane +; GFX9-NEXT: s_cbranch_scc0 .LBB91_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_lshr_b32 s6, s5, 24 -; GFX9-NEXT: v_writelane_b32 v62, s6, 18 -; GFX9-NEXT: s_lshr_b32 s6, s5, 16 -; GFX9-NEXT: v_writelane_b32 v62, s6, 17 -; GFX9-NEXT: s_lshr_b32 s6, s5, 8 -; GFX9-NEXT: v_writelane_b32 v62, s6, 19 -; GFX9-NEXT: s_lshr_b32 s6, s4, 16 -; GFX9-NEXT: v_writelane_b32 v62, s6, 20 -; GFX9-NEXT: s_lshr_b32 s6, s4, 8 -; GFX9-NEXT: v_writelane_b32 v62, s6, 21 -; GFX9-NEXT: s_lshr_b32 s6, s29, 24 -; GFX9-NEXT: v_writelane_b32 v62, s6, 22 -; GFX9-NEXT: s_lshr_b32 s6, s29, 16 -; GFX9-NEXT: v_writelane_b32 v62, s6, 16 -; GFX9-NEXT: s_lshr_b32 s6, s29, 8 -; GFX9-NEXT: v_writelane_b32 v62, s6, 23 -; GFX9-NEXT: s_lshr_b32 s6, s28, 16 -; GFX9-NEXT: v_writelane_b32 v62, s6, 24 -; GFX9-NEXT: s_lshr_b32 s6, s28, 8 -; GFX9-NEXT: v_writelane_b32 v62, s6, 25 -; GFX9-NEXT: s_lshr_b32 s6, s27, 24 -; GFX9-NEXT: v_writelane_b32 v62, s6, 26 -; GFX9-NEXT: s_lshr_b32 s6, s27, 16 -; GFX9-NEXT: v_writelane_b32 v62, s6, 15 -; GFX9-NEXT: s_lshr_b32 s6, s27, 8 -; GFX9-NEXT: v_writelane_b32 v62, s6, 27 -; GFX9-NEXT: s_lshr_b32 s6, s26, 16 -; GFX9-NEXT: v_writelane_b32 v62, s6, 28 -; GFX9-NEXT: s_lshr_b32 s6, s26, 8 -; GFX9-NEXT: v_writelane_b32 v62, s6, 29 -; GFX9-NEXT: s_lshr_b32 s6, s25, 24 -; GFX9-NEXT: v_writelane_b32 v62, s6, 30 -; GFX9-NEXT: s_lshr_b32 s6, s25, 16 -; GFX9-NEXT: v_writelane_b32 v62, s6, 14 -; GFX9-NEXT: s_lshr_b32 s6, s25, 8 -; GFX9-NEXT: v_writelane_b32 v62, s6, 31 -; GFX9-NEXT: s_lshr_b32 s6, s24, 16 -; GFX9-NEXT: v_writelane_b32 v62, s6, 32 -; GFX9-NEXT: s_lshr_b32 s6, s24, 8 -; GFX9-NEXT: v_writelane_b32 v62, s6, 33 -; GFX9-NEXT: s_lshr_b32 s6, s23, 24 -; GFX9-NEXT: v_writelane_b32 v62, s6, 34 -; GFX9-NEXT: s_lshr_b32 s6, s23, 16 -; GFX9-NEXT: v_writelane_b32 v62, s6, 13 -; GFX9-NEXT: s_lshr_b32 s6, s23, 8 -; GFX9-NEXT: v_writelane_b32 v62, s6, 35 -; GFX9-NEXT: s_lshr_b32 s6, s22, 16 -; GFX9-NEXT: v_writelane_b32 v62, s6, 36 -; GFX9-NEXT: s_lshr_b32 s6, s22, 8 -; GFX9-NEXT: v_writelane_b32 v62, s6, 37 -; GFX9-NEXT: s_lshr_b32 s6, s21, 24 -; GFX9-NEXT: v_writelane_b32 v62, s6, 38 -; GFX9-NEXT: s_lshr_b32 s6, s21, 16 -; GFX9-NEXT: v_writelane_b32 v62, s6, 12 -; GFX9-NEXT: s_lshr_b32 s6, s21, 8 -; GFX9-NEXT: v_writelane_b32 v62, s6, 39 -; GFX9-NEXT: s_lshr_b32 s6, s20, 16 -; GFX9-NEXT: v_writelane_b32 v62, s6, 40 -; GFX9-NEXT: s_lshr_b32 s6, s20, 8 -; GFX9-NEXT: v_writelane_b32 v62, s6, 41 -; GFX9-NEXT: s_lshr_b32 s6, s19, 24 -; GFX9-NEXT: v_writelane_b32 v62, s6, 42 -; GFX9-NEXT: s_lshr_b32 s6, s19, 16 -; GFX9-NEXT: v_writelane_b32 v62, s6, 11 -; GFX9-NEXT: s_lshr_b32 s6, s19, 8 -; GFX9-NEXT: v_writelane_b32 v62, s6, 43 -; GFX9-NEXT: s_lshr_b32 s6, s18, 16 -; GFX9-NEXT: v_writelane_b32 v62, s6, 44 -; GFX9-NEXT: s_lshr_b32 s6, s18, 8 -; GFX9-NEXT: v_writelane_b32 v62, s6, 45 -; GFX9-NEXT: s_lshr_b32 s6, s17, 24 -; GFX9-NEXT: v_writelane_b32 v62, s6, 46 -; GFX9-NEXT: s_lshr_b32 s6, s17, 16 -; GFX9-NEXT: v_writelane_b32 v62, s6, 10 -; GFX9-NEXT: s_lshr_b32 s6, s17, 8 -; GFX9-NEXT: v_writelane_b32 v62, s6, 47 -; GFX9-NEXT: s_lshr_b32 s6, s16, 16 -; GFX9-NEXT: v_writelane_b32 v62, s6, 48 -; GFX9-NEXT: s_lshr_b32 s6, s16, 8 -; GFX9-NEXT: v_writelane_b32 v62, s6, 49 -; GFX9-NEXT: s_lshr_b64 s[40:41], s[4:5], 24 -; GFX9-NEXT: v_writelane_b32 v62, s40, 8 -; GFX9-NEXT: v_writelane_b32 v62, s41, 9 -; GFX9-NEXT: s_lshr_b64 s[40:41], s[28:29], 24 -; GFX9-NEXT: v_writelane_b32 v62, s40, 6 -; GFX9-NEXT: v_writelane_b32 v62, s41, 7 -; GFX9-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 -; GFX9-NEXT: v_writelane_b32 v62, s40, 4 -; GFX9-NEXT: v_writelane_b32 v62, s41, 5 -; GFX9-NEXT: s_lshr_b64 s[40:41], s[24:25], 24 -; GFX9-NEXT: v_writelane_b32 v62, s40, 2 -; GFX9-NEXT: v_writelane_b32 v62, s41, 3 -; GFX9-NEXT: s_lshr_b64 s[40:41], s[22:23], 24 -; GFX9-NEXT: v_writelane_b32 v62, s40, 0 -; GFX9-NEXT: s_lshr_b32 s70, s47, 24 -; GFX9-NEXT: s_lshr_b32 s15, s47, 16 -; GFX9-NEXT: s_lshr_b32 s7, s47, 8 -; GFX9-NEXT: s_lshr_b32 s53, s46, 16 -; GFX9-NEXT: s_lshr_b32 s52, s46, 8 -; GFX9-NEXT: s_lshr_b32 s67, s57, 24 -; GFX9-NEXT: s_lshr_b32 s14, s57, 16 -; GFX9-NEXT: s_lshr_b32 s69, s57, 8 -; GFX9-NEXT: s_lshr_b32 s6, s56, 16 -; GFX9-NEXT: s_lshr_b32 s71, s56, 8 -; GFX9-NEXT: s_lshr_b32 s64, s59, 24 -; GFX9-NEXT: s_lshr_b32 s13, s59, 16 -; GFX9-NEXT: s_lshr_b32 s66, s59, 8 -; GFX9-NEXT: s_lshr_b32 s51, s58, 16 -; GFX9-NEXT: s_lshr_b32 s68, s58, 8 -; GFX9-NEXT: s_lshr_b32 s99, s61, 24 -; GFX9-NEXT: s_lshr_b32 s12, s61, 16 -; GFX9-NEXT: s_lshr_b32 s55, s61, 8 -; GFX9-NEXT: s_lshr_b32 s50, s60, 16 -; GFX9-NEXT: s_lshr_b32 s65, s60, 8 -; GFX9-NEXT: s_lshr_b32 s96, s63, 24 -; GFX9-NEXT: s_lshr_b32 s11, s63, 16 -; GFX9-NEXT: s_lshr_b32 s98, s63, 8 -; GFX9-NEXT: s_lshr_b32 s49, s62, 16 -; GFX9-NEXT: s_lshr_b32 s54, s62, 8 -; GFX9-NEXT: s_lshr_b32 s85, s73, 24 -; GFX9-NEXT: s_lshr_b32 s10, s73, 16 -; GFX9-NEXT: s_lshr_b32 s87, s73, 8 -; GFX9-NEXT: s_lshr_b32 s48, s72, 16 -; GFX9-NEXT: s_lshr_b32 s97, s72, 8 -; GFX9-NEXT: s_lshr_b32 s82, s75, 24 -; GFX9-NEXT: s_lshr_b32 s9, s75, 16 -; GFX9-NEXT: s_lshr_b32 s84, s75, 8 -; GFX9-NEXT: s_lshr_b32 s39, s74, 16 -; GFX9-NEXT: s_lshr_b32 s86, s74, 8 -; GFX9-NEXT: s_lshr_b32 s80, s77, 24 -; GFX9-NEXT: s_lshr_b32 s8, s77, 16 -; GFX9-NEXT: s_lshr_b32 s81, s77, 8 -; GFX9-NEXT: s_lshr_b32 s38, s76, 16 -; GFX9-NEXT: s_lshr_b32 s83, s76, 8 -; GFX9-NEXT: v_writelane_b32 v62, s41, 1 -; GFX9-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 -; GFX9-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 -; GFX9-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 -; GFX9-NEXT: s_lshr_b64 s[78:79], s[46:47], 24 -; GFX9-NEXT: s_lshr_b64 s[88:89], s[56:57], 24 -; GFX9-NEXT: s_lshr_b64 s[90:91], s[58:59], 24 -; GFX9-NEXT: s_lshr_b64 s[92:93], s[60:61], 24 -; GFX9-NEXT: s_lshr_b64 s[94:95], s[62:63], 24 -; GFX9-NEXT: s_lshr_b64 s[30:31], s[72:73], 24 -; GFX9-NEXT: s_lshr_b64 s[34:35], s[74:75], 24 -; GFX9-NEXT: s_lshr_b64 s[36:37], s[76:77], 24 -; GFX9-NEXT: s_cbranch_execnz .LBB91_4 +; GFX9-NEXT: s_lshr_b32 s46, s19, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 14 +; GFX9-NEXT: s_lshr_b32 s46, s19, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 15 +; GFX9-NEXT: s_lshr_b32 s46, s18, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 16 +; GFX9-NEXT: s_lshr_b32 s46, s18, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 17 +; GFX9-NEXT: s_lshr_b32 s46, s21, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 18 +; GFX9-NEXT: s_lshr_b32 s46, s21, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 19 +; GFX9-NEXT: s_lshr_b32 s46, s20, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 20 +; GFX9-NEXT: s_lshr_b32 s46, s20, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 21 +; GFX9-NEXT: s_lshr_b32 s46, s23, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 22 +; GFX9-NEXT: s_lshr_b32 s46, s23, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 13 +; GFX9-NEXT: s_lshr_b32 s46, s23, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 23 +; GFX9-NEXT: s_lshr_b32 s46, s22, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 24 +; GFX9-NEXT: s_lshr_b32 s46, s22, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 25 +; GFX9-NEXT: s_lshr_b32 s46, s25, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 26 +; GFX9-NEXT: s_lshr_b32 s46, s25, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 12 +; GFX9-NEXT: s_lshr_b32 s46, s25, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 27 +; GFX9-NEXT: s_lshr_b32 s46, s24, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 28 +; GFX9-NEXT: s_lshr_b32 s46, s24, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 29 +; GFX9-NEXT: s_lshr_b32 s46, s27, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 30 +; GFX9-NEXT: s_lshr_b32 s46, s27, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 11 +; GFX9-NEXT: s_lshr_b32 s46, s27, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 31 +; GFX9-NEXT: s_lshr_b32 s46, s29, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 10 +; GFX9-NEXT: s_lshr_b32 s46, s43, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 9 +; GFX9-NEXT: s_lshr_b32 s46, s45, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 8 +; GFX9-NEXT: s_lshr_b32 s46, s5, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 7 +; GFX9-NEXT: s_lshr_b32 s46, s7, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 6 +; GFX9-NEXT: s_lshr_b32 s46, s9, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 32 +; GFX9-NEXT: s_lshr_b32 s46, s9, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 5 +; GFX9-NEXT: s_lshr_b32 s46, s9, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 33 +; GFX9-NEXT: s_lshr_b32 s46, s8, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 34 +; GFX9-NEXT: s_lshr_b32 s46, s8, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 35 +; GFX9-NEXT: s_lshr_b32 s46, s11, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 4 +; GFX9-NEXT: s_lshr_b32 s46, s11, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 36 +; GFX9-NEXT: s_lshr_b32 s46, s10, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 37 +; GFX9-NEXT: s_lshr_b32 s46, s10, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 38 +; GFX9-NEXT: s_lshr_b32 s46, s13, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 3 +; GFX9-NEXT: s_lshr_b32 s46, s13, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 39 +; GFX9-NEXT: s_lshr_b32 s46, s15, 16 +; GFX9-NEXT: s_lshr_b32 s60, s4, 8 +; GFX9-NEXT: s_lshr_b32 s61, s7, 24 +; GFX9-NEXT: s_lshr_b32 s62, s7, 8 +; GFX9-NEXT: s_lshr_b32 s63, s6, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 2 +; GFX9-NEXT: s_lshr_b32 s46, s17, 16 +; GFX9-NEXT: s_lshr_b64 s[90:91], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[94:95], s[26:27], 24 +; GFX9-NEXT: s_lshr_b64 s[30:31], s[28:29], 24 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[42:43], 24 +; GFX9-NEXT: s_lshr_b32 s47, s5, 24 +; GFX9-NEXT: s_lshr_b32 s56, s5, 8 +; GFX9-NEXT: s_lshr_b32 s57, s4, 16 +; GFX9-NEXT: s_lshr_b32 s74, s6, 8 +; GFX9-NEXT: s_lshr_b32 s75, s11, 24 +; GFX9-NEXT: s_lshr_b32 s78, s13, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 1 +; GFX9-NEXT: s_lshr_b32 s46, s41, 16 +; GFX9-NEXT: s_lshr_b64 s[58:59], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[72:73], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[76:77], s[22:23], 24 +; GFX9-NEXT: s_mov_b32 s91, s60 +; GFX9-NEXT: s_mov_b32 s95, s61 +; GFX9-NEXT: s_mov_b32 s31, s62 +; GFX9-NEXT: s_mov_b32 s35, s63 +; GFX9-NEXT: s_lshr_b64 s[36:37], s[44:45], 24 +; GFX9-NEXT: s_lshr_b64 s[60:61], s[8:9], 24 +; GFX9-NEXT: s_lshr_b64 s[62:63], s[10:11], 24 +; GFX9-NEXT: s_lshr_b32 s69, s19, 16 +; GFX9-NEXT: s_lshr_b32 s68, s21, 16 +; GFX9-NEXT: s_lshr_b32 s38, s26, 16 +; GFX9-NEXT: s_lshr_b32 s48, s26, 8 +; GFX9-NEXT: s_lshr_b32 s50, s29, 24 +; GFX9-NEXT: s_lshr_b32 s51, s29, 8 +; GFX9-NEXT: s_lshr_b32 s53, s28, 16 +; GFX9-NEXT: s_lshr_b32 s71, s28, 8 +; GFX9-NEXT: s_lshr_b32 s80, s43, 24 +; GFX9-NEXT: s_lshr_b32 s82, s43, 8 +; GFX9-NEXT: s_lshr_b32 s65, s42, 16 +; GFX9-NEXT: s_lshr_b32 s85, s42, 8 +; GFX9-NEXT: s_lshr_b32 s66, s45, 24 +; GFX9-NEXT: s_lshr_b32 s97, s45, 8 +; GFX9-NEXT: s_lshr_b32 s99, s44, 16 +; GFX9-NEXT: s_lshr_b32 s67, s44, 8 +; GFX9-NEXT: s_lshr_b32 s54, s12, 16 +; GFX9-NEXT: s_lshr_b32 s39, s12, 8 +; GFX9-NEXT: s_lshr_b32 s49, s15, 24 +; GFX9-NEXT: s_lshr_b32 s55, s15, 8 +; GFX9-NEXT: s_lshr_b32 s52, s14, 16 +; GFX9-NEXT: s_lshr_b32 s70, s14, 8 +; GFX9-NEXT: s_lshr_b32 s64, s17, 24 +; GFX9-NEXT: s_lshr_b32 s81, s17, 8 +; GFX9-NEXT: s_lshr_b32 s83, s16, 16 +; GFX9-NEXT: s_lshr_b32 s84, s16, 8 +; GFX9-NEXT: s_lshr_b32 s86, s41, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 0 +; GFX9-NEXT: s_lshr_b32 s87, s41, 8 +; GFX9-NEXT: s_lshr_b32 s96, s40, 16 +; GFX9-NEXT: s_lshr_b32 s98, s40, 8 +; GFX9-NEXT: s_mov_b32 s59, s47 +; GFX9-NEXT: s_mov_b32 s73, s56 +; GFX9-NEXT: s_mov_b32 s77, s57 +; GFX9-NEXT: s_mov_b32 s37, s74 +; GFX9-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[56:57], s[6:7], 24 +; GFX9-NEXT: s_mov_b32 s61, s75 +; GFX9-NEXT: s_mov_b32 s63, s78 +; GFX9-NEXT: s_lshr_b64 s[74:75], s[12:13], 24 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[14:15], 24 +; GFX9-NEXT: s_lshr_b64 s[92:93], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[88:89], s[40:41], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB91_3 ; GFX9-NEXT: .LBB91_2: ; %cmp.true -; GFX9-NEXT: s_and_b32 s6, s77, 0xffff0000 +; GFX9-NEXT: s_and_b32 s46, s41, 0xffff0000 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40c00000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s46, v1 +; GFX9-NEXT: v_readfirstlane_b32 s46, v2 +; GFX9-NEXT: s_bfe_u32 s47, s46, 0x10010 +; GFX9-NEXT: s_add_i32 s47, s47, s46 +; GFX9-NEXT: s_add_i32 s56, s47, 0x7fff +; GFX9-NEXT: s_or_b32 s57, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_lshl_b32 s6, s77, 16 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s46, s57, s56 +; GFX9-NEXT: s_lshl_b32 s41, s41, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s41, v1 +; GFX9-NEXT: v_readfirstlane_b32 s41, v2 +; GFX9-NEXT: s_lshr_b32 s57, s46, 16 +; GFX9-NEXT: s_bfe_u32 s46, s41, 0x10010 +; GFX9-NEXT: s_add_i32 s46, s46, s41 +; GFX9-NEXT: s_add_i32 s56, s46, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s41, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: s_and_b32 s6, s76, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v8, v5, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s41, s41, s56 +; GFX9-NEXT: s_and_b32 s46, s40, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s46, v1 +; GFX9-NEXT: v_readfirstlane_b32 s46, v2 +; GFX9-NEXT: s_bfe_u32 s47, s46, 0x10010 +; GFX9-NEXT: s_lshr_b32 s41, s41, 16 +; GFX9-NEXT: s_add_i32 s47, s47, s46 +; GFX9-NEXT: v_writelane_b32 v21, s57, 0 +; GFX9-NEXT: s_pack_ll_b32_b16 s89, s41, s57 +; GFX9-NEXT: s_add_i32 s56, s47, 0x7fff +; GFX9-NEXT: s_or_b32 s57, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_lshl_b32 s6, s76, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add_f32_e32 v3, s6, v1 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 -; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX9-NEXT: s_and_b32 s6, s75, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v7, v2, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s46, s57, s56 +; GFX9-NEXT: s_lshl_b32 s40, s40, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s40, v1 +; GFX9-NEXT: v_readfirstlane_b32 s40, v2 +; GFX9-NEXT: s_lshr_b32 s56, s46, 16 +; GFX9-NEXT: s_bfe_u32 s46, s40, 0x10010 +; GFX9-NEXT: s_add_i32 s46, s46, s40 +; GFX9-NEXT: s_add_i32 s57, s46, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s40, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_lshl_b32 s6, s75, 16 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s40, s40, s57 +; GFX9-NEXT: s_and_b32 s46, s17, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s46, v1 +; GFX9-NEXT: v_readfirstlane_b32 s46, v2 +; GFX9-NEXT: s_bfe_u32 s47, s46, 0x10010 +; GFX9-NEXT: s_lshr_b32 s40, s40, 16 +; GFX9-NEXT: s_add_i32 s47, s47, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s88, s40, s56 +; GFX9-NEXT: s_add_i32 s56, s47, 0x7fff +; GFX9-NEXT: s_or_b32 s57, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v31 -; GFX9-NEXT: s_and_b32 s6, s74, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v14, v5, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s46, s57, s56 +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s17, v1 +; GFX9-NEXT: v_readfirstlane_b32 s17, v2 +; GFX9-NEXT: s_lshr_b32 s57, s46, 16 +; GFX9-NEXT: s_bfe_u32 s46, s17, 0x10010 +; GFX9-NEXT: s_add_i32 s46, s46, s17 +; GFX9-NEXT: s_add_i32 s56, s46, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s17, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_lshl_b32 s6, s74, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add_f32_e32 v3, s6, v1 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 -; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v33 -; GFX9-NEXT: s_and_b32 s6, s73, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v13, v2, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s17, s17, s56 +; GFX9-NEXT: s_and_b32 s46, s16, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s46, v1 +; GFX9-NEXT: v_readfirstlane_b32 s46, v2 +; GFX9-NEXT: s_bfe_u32 s47, s46, 0x10010 +; GFX9-NEXT: s_lshr_b32 s17, s17, 16 +; GFX9-NEXT: s_add_i32 s47, s47, s46 +; GFX9-NEXT: v_writelane_b32 v21, s57, 1 +; GFX9-NEXT: s_pack_ll_b32_b16 s93, s17, s57 +; GFX9-NEXT: s_add_i32 s56, s47, 0x7fff +; GFX9-NEXT: s_or_b32 s57, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_lshl_b32 s6, s73, 16 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s46, s57, s56 +; GFX9-NEXT: s_lshl_b32 s16, s16, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s16, v1 +; GFX9-NEXT: v_readfirstlane_b32 s16, v2 +; GFX9-NEXT: s_lshr_b32 s56, s46, 16 +; GFX9-NEXT: s_bfe_u32 s46, s16, 0x10010 +; GFX9-NEXT: s_add_i32 s46, s46, s16 +; GFX9-NEXT: s_add_i32 s57, s46, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s16, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v34 -; GFX9-NEXT: s_and_b32 s6, s72, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v16, v32, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s16, s16, s57 +; GFX9-NEXT: s_and_b32 s46, s15, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s46, v1 +; GFX9-NEXT: v_readfirstlane_b32 s46, v2 +; GFX9-NEXT: s_bfe_u32 s47, s46, 0x10010 +; GFX9-NEXT: s_lshr_b32 s16, s16, 16 +; GFX9-NEXT: s_add_i32 s47, s47, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s92, s16, s56 +; GFX9-NEXT: s_add_i32 s56, s47, 0x7fff +; GFX9-NEXT: s_or_b32 s57, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_lshl_b32 s6, s72, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add_f32_e32 v3, s6, v1 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v36 -; GFX9-NEXT: s_and_b32 s6, s63, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v15, v2, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s46, s57, s56 +; GFX9-NEXT: s_lshl_b32 s15, s15, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s15, v1 +; GFX9-NEXT: v_readfirstlane_b32 s15, v2 +; GFX9-NEXT: s_lshr_b32 s57, s46, 16 +; GFX9-NEXT: s_bfe_u32 s46, s15, 0x10010 +; GFX9-NEXT: s_add_i32 s46, s46, s15 +; GFX9-NEXT: s_add_i32 s56, s46, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s15, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_lshl_b32 s6, s63, 16 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s15, s15, s56 +; GFX9-NEXT: s_and_b32 s46, s14, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s46, v1 +; GFX9-NEXT: v_readfirstlane_b32 s46, v2 +; GFX9-NEXT: s_bfe_u32 s47, s46, 0x10010 +; GFX9-NEXT: s_lshr_b32 s15, s15, 16 +; GFX9-NEXT: s_add_i32 s47, s47, s46 +; GFX9-NEXT: v_writelane_b32 v21, s57, 2 +; GFX9-NEXT: s_pack_ll_b32_b16 s79, s15, s57 +; GFX9-NEXT: s_add_i32 s56, s47, 0x7fff +; GFX9-NEXT: s_or_b32 s57, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v37 -; GFX9-NEXT: s_and_b32 s6, s62, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v18, v35, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s46, s57, s56 +; GFX9-NEXT: s_lshl_b32 s14, s14, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s14, v1 +; GFX9-NEXT: v_readfirstlane_b32 s14, v2 +; GFX9-NEXT: s_lshr_b32 s56, s46, 16 +; GFX9-NEXT: s_bfe_u32 s46, s14, 0x10010 +; GFX9-NEXT: s_add_i32 s46, s46, s14 +; GFX9-NEXT: s_add_i32 s57, s46, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s14, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_lshl_b32 s6, s62, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add_f32_e32 v3, s6, v1 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v39 -; GFX9-NEXT: s_and_b32 s6, s61, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v17, v2, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s14, s14, s57 +; GFX9-NEXT: s_and_b32 s46, s13, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s46, v1 +; GFX9-NEXT: v_readfirstlane_b32 s46, v2 +; GFX9-NEXT: s_bfe_u32 s47, s46, 0x10010 +; GFX9-NEXT: s_lshr_b32 s14, s14, 16 +; GFX9-NEXT: s_add_i32 s47, s47, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s78, s14, s56 +; GFX9-NEXT: s_add_i32 s56, s47, 0x7fff +; GFX9-NEXT: s_or_b32 s57, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_lshl_b32 s6, s61, 16 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s46, s57, s56 +; GFX9-NEXT: s_lshl_b32 s13, s13, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s13, v1 +; GFX9-NEXT: v_readfirstlane_b32 s13, v2 +; GFX9-NEXT: s_lshr_b32 s57, s46, 16 +; GFX9-NEXT: s_bfe_u32 s46, s13, 0x10010 +; GFX9-NEXT: s_add_i32 s46, s46, s13 +; GFX9-NEXT: s_add_i32 s56, s46, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s13, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v48 -; GFX9-NEXT: s_and_b32 s6, s60, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v20, v38, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s13, s13, s56 +; GFX9-NEXT: s_and_b32 s46, s12, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s46, v1 +; GFX9-NEXT: v_readfirstlane_b32 s46, v2 +; GFX9-NEXT: s_bfe_u32 s47, s46, 0x10010 +; GFX9-NEXT: s_lshr_b32 s13, s13, 16 +; GFX9-NEXT: s_add_i32 s47, s47, s46 +; GFX9-NEXT: v_writelane_b32 v21, s57, 3 +; GFX9-NEXT: s_pack_ll_b32_b16 s75, s13, s57 +; GFX9-NEXT: s_add_i32 s56, s47, 0x7fff +; GFX9-NEXT: s_or_b32 s57, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_lshl_b32 s6, s60, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add_f32_e32 v3, s6, v1 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v50 -; GFX9-NEXT: s_and_b32 s6, s59, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v19, v2, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s46, s57, s56 +; GFX9-NEXT: s_lshl_b32 s12, s12, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s12, v1 +; GFX9-NEXT: v_readfirstlane_b32 s12, v2 +; GFX9-NEXT: s_lshr_b32 s56, s46, 16 +; GFX9-NEXT: s_bfe_u32 s46, s12, 0x10010 +; GFX9-NEXT: s_add_i32 s46, s46, s12 +; GFX9-NEXT: s_add_i32 s57, s46, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s12, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_lshl_b32 s6, s59, 16 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s12, s12, s57 +; GFX9-NEXT: s_and_b32 s46, s11, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s46, v1 +; GFX9-NEXT: v_readfirstlane_b32 s46, v2 +; GFX9-NEXT: s_bfe_u32 s47, s46, 0x10010 +; GFX9-NEXT: s_lshr_b32 s12, s12, 16 +; GFX9-NEXT: s_add_i32 s47, s47, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s74, s12, s56 +; GFX9-NEXT: s_add_i32 s56, s47, 0x7fff +; GFX9-NEXT: s_or_b32 s57, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v51 -; GFX9-NEXT: s_and_b32 s6, s58, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v22, v49, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s46, s57, s56 +; GFX9-NEXT: s_lshl_b32 s11, s11, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s11, v1 +; GFX9-NEXT: v_readfirstlane_b32 s11, v2 +; GFX9-NEXT: s_lshr_b32 s57, s46, 16 +; GFX9-NEXT: s_bfe_u32 s46, s11, 0x10010 +; GFX9-NEXT: s_add_i32 s46, s46, s11 +; GFX9-NEXT: s_add_i32 s56, s46, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s11, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_lshl_b32 s6, s58, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add_f32_e32 v3, s6, v1 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v53 -; GFX9-NEXT: s_and_b32 s6, s57, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v21, v2, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s11, s11, s56 +; GFX9-NEXT: s_and_b32 s46, s10, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s46, v1 +; GFX9-NEXT: v_readfirstlane_b32 s46, v2 +; GFX9-NEXT: s_bfe_u32 s47, s46, 0x10010 +; GFX9-NEXT: s_lshr_b32 s11, s11, 16 +; GFX9-NEXT: s_add_i32 s47, s47, s46 +; GFX9-NEXT: v_writelane_b32 v21, s57, 4 +; GFX9-NEXT: s_pack_ll_b32_b16 s63, s11, s57 +; GFX9-NEXT: s_add_i32 s56, s47, 0x7fff +; GFX9-NEXT: s_or_b32 s57, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_lshl_b32 s6, s57, 16 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s46, s57, s56 +; GFX9-NEXT: s_lshl_b32 s10, s10, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s10, v1 +; GFX9-NEXT: v_readfirstlane_b32 s10, v2 +; GFX9-NEXT: s_lshr_b32 s56, s46, 16 +; GFX9-NEXT: s_bfe_u32 s46, s10, 0x10010 +; GFX9-NEXT: s_add_i32 s46, s46, s10 +; GFX9-NEXT: s_add_i32 s57, s46, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s10, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v54 -; GFX9-NEXT: s_and_b32 s6, s56, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v24, v52, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s10, s10, s57 +; GFX9-NEXT: s_and_b32 s46, s9, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s46, v1 +; GFX9-NEXT: v_readfirstlane_b32 s46, v2 +; GFX9-NEXT: s_bfe_u32 s47, s46, 0x10010 +; GFX9-NEXT: s_lshr_b32 s10, s10, 16 +; GFX9-NEXT: s_add_i32 s47, s47, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s62, s10, s56 +; GFX9-NEXT: s_add_i32 s56, s47, 0x7fff +; GFX9-NEXT: s_or_b32 s57, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_lshl_b32 s6, s56, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add_f32_e32 v3, s6, v1 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v40 -; GFX9-NEXT: s_and_b32 s6, s47, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v23, v2, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s46, s57, s56 +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s9, v1 +; GFX9-NEXT: v_readfirstlane_b32 s9, v2 +; GFX9-NEXT: s_lshr_b32 s57, s46, 16 +; GFX9-NEXT: s_bfe_u32 s46, s9, 0x10010 +; GFX9-NEXT: s_add_i32 s46, s46, s9 +; GFX9-NEXT: s_add_i32 s56, s46, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s9, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_lshl_b32 s6, s47, 16 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s9, s9, s56 +; GFX9-NEXT: s_and_b32 s46, s8, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s46, v1 +; GFX9-NEXT: v_readfirstlane_b32 s46, v2 +; GFX9-NEXT: s_bfe_u32 s47, s46, 0x10010 +; GFX9-NEXT: s_lshr_b32 s9, s9, 16 +; GFX9-NEXT: s_add_i32 s47, s47, s46 +; GFX9-NEXT: v_writelane_b32 v21, s57, 5 +; GFX9-NEXT: s_pack_ll_b32_b16 s61, s9, s57 +; GFX9-NEXT: s_add_i32 s56, s47, 0x7fff +; GFX9-NEXT: s_or_b32 s57, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v41 -; GFX9-NEXT: s_and_b32 s6, s46, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v26, v55, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s46, s57, s56 +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s8, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: s_lshr_b32 s56, s46, 16 +; GFX9-NEXT: s_bfe_u32 s46, s8, 0x10010 +; GFX9-NEXT: s_add_i32 s46, s46, s8 +; GFX9-NEXT: s_add_i32 s57, s46, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s8, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_lshl_b32 s6, s46, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add_f32_e32 v3, s6, v1 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v42 -; GFX9-NEXT: s_and_b32 s6, s17, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v25, v2, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s8, s8, s57 +; GFX9-NEXT: s_and_b32 s46, s7, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s46, v1 +; GFX9-NEXT: v_readfirstlane_b32 s46, v2 +; GFX9-NEXT: s_bfe_u32 s47, s46, 0x10010 +; GFX9-NEXT: s_lshr_b32 s8, s8, 16 +; GFX9-NEXT: s_add_i32 s47, s47, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s60, s8, s56 +; GFX9-NEXT: s_add_i32 s56, s47, 0x7fff +; GFX9-NEXT: s_or_b32 s57, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s11, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s17, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s46, s57, s56 +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s7, v2 +; GFX9-NEXT: s_lshr_b32 s57, s46, 16 +; GFX9-NEXT: s_bfe_u32 s46, s7, 0x10010 +; GFX9-NEXT: s_add_i32 s46, s46, s7 +; GFX9-NEXT: s_add_i32 s56, s46, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s7, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s17, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s16, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s7, s7, s56 +; GFX9-NEXT: s_and_b32 s46, s6, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s46, v1 +; GFX9-NEXT: v_readfirstlane_b32 s46, v2 +; GFX9-NEXT: s_bfe_u32 s47, s46, 0x10010 +; GFX9-NEXT: s_add_i32 s47, s47, s46 +; GFX9-NEXT: s_lshr_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s56, s47, 0x7fff +; GFX9-NEXT: s_or_b32 s58, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s8, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s16, 16 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s46, s58, s56 +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 ; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 ; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: s_lshr_b32 s56, s46, 16 +; GFX9-NEXT: s_bfe_u32 s46, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s46, s46, s6 +; GFX9-NEXT: s_add_i32 s58, s46, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s6, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s10, s9 -; GFX9-NEXT: s_lshr_b32 s16, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s19, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s46, s16, s8 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s6, s58 +; GFX9-NEXT: s_and_b32 s46, s5, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s46, v1 +; GFX9-NEXT: v_readfirstlane_b32 s46, v2 +; GFX9-NEXT: s_bfe_u32 s47, s46, 0x10010 +; GFX9-NEXT: s_add_i32 s47, s47, s46 +; GFX9-NEXT: s_lshr_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s58, s47, 0x7fff +; GFX9-NEXT: s_or_b32 s59, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s12, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s19, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s46, s59, s58 +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s5, v1 +; GFX9-NEXT: v_readfirstlane_b32 s5, v2 +; GFX9-NEXT: s_lshr_b32 s59, s46, 16 +; GFX9-NEXT: s_bfe_u32 s46, s5, 0x10010 +; GFX9-NEXT: s_add_i32 s46, s46, s5 +; GFX9-NEXT: s_add_i32 s58, s46, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s5, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s19, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s18, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s5, s5, s58 +; GFX9-NEXT: s_and_b32 s46, s4, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s46, v1 +; GFX9-NEXT: v_readfirstlane_b32 s46, v2 +; GFX9-NEXT: s_bfe_u32 s58, s46, 0x10010 +; GFX9-NEXT: s_add_i32 s58, s58, s46 +; GFX9-NEXT: v_writelane_b32 v21, s57, 6 +; GFX9-NEXT: s_lshr_b32 s5, s5, 16 +; GFX9-NEXT: s_add_i32 s72, s58, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s46, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s8, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s18, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: v_writelane_b32 v21, s59, 7 +; GFX9-NEXT: s_pack_ll_b32_b16 s47, s5, s59 +; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec +; GFX9-NEXT: s_cselect_b32 s46, s46, s72 +; GFX9-NEXT: s_lshl_b32 s4, s4, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s4, v1 +; GFX9-NEXT: v_readfirstlane_b32 s4, v2 +; GFX9-NEXT: s_bfe_u32 s58, s4, 0x10010 +; GFX9-NEXT: s_add_i32 s58, s58, s4 +; GFX9-NEXT: s_lshr_b32 s46, s46, 16 +; GFX9-NEXT: s_add_i32 s72, s58, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s4, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s10, s9 -; GFX9-NEXT: s_lshr_b32 s18, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s21, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s56, s18, s8 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec +; GFX9-NEXT: s_cselect_b32 s4, s4, s72 +; GFX9-NEXT: s_and_b32 s58, s45, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s58, v1 +; GFX9-NEXT: v_readfirstlane_b32 s58, v2 +; GFX9-NEXT: s_bfe_u32 s59, s58, 0x10010 +; GFX9-NEXT: s_add_i32 s59, s59, s58 +; GFX9-NEXT: s_lshr_b32 s4, s4, 16 +; GFX9-NEXT: s_add_i32 s72, s59, 0x7fff +; GFX9-NEXT: s_or_b32 s73, s58, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s13, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s21, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec +; GFX9-NEXT: s_cselect_b32 s58, s73, s72 +; GFX9-NEXT: s_lshl_b32 s45, s45, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s45, v1 +; GFX9-NEXT: v_readfirstlane_b32 s45, v2 +; GFX9-NEXT: s_lshr_b32 s73, s58, 16 +; GFX9-NEXT: s_bfe_u32 s58, s45, 0x10010 +; GFX9-NEXT: s_add_i32 s58, s58, s45 +; GFX9-NEXT: s_add_i32 s72, s58, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s45, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s21, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s20, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec +; GFX9-NEXT: s_cselect_b32 s45, s45, s72 +; GFX9-NEXT: s_and_b32 s58, s44, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s58, v1 +; GFX9-NEXT: v_readfirstlane_b32 s58, v2 +; GFX9-NEXT: s_bfe_u32 s59, s58, 0x10010 +; GFX9-NEXT: s_lshr_b32 s45, s45, 16 +; GFX9-NEXT: s_add_i32 s59, s59, s58 +; GFX9-NEXT: v_writelane_b32 v21, s73, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s37, s45, s73 +; GFX9-NEXT: s_add_i32 s72, s59, 0x7fff +; GFX9-NEXT: s_or_b32 s73, s58, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s8, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s20, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec +; GFX9-NEXT: s_cselect_b32 s58, s73, s72 +; GFX9-NEXT: s_lshl_b32 s44, s44, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s44, v1 +; GFX9-NEXT: v_readfirstlane_b32 s44, v2 +; GFX9-NEXT: s_lshr_b32 s72, s58, 16 +; GFX9-NEXT: s_bfe_u32 s58, s44, 0x10010 +; GFX9-NEXT: s_add_i32 s58, s58, s44 +; GFX9-NEXT: s_add_i32 s73, s58, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s44, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s10, s9 -; GFX9-NEXT: s_lshr_b32 s20, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s23, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s58, s20, s8 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec +; GFX9-NEXT: s_cselect_b32 s44, s44, s73 +; GFX9-NEXT: s_and_b32 s58, s43, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s58, v1 +; GFX9-NEXT: v_readfirstlane_b32 s58, v2 +; GFX9-NEXT: s_bfe_u32 s59, s58, 0x10010 +; GFX9-NEXT: s_lshr_b32 s44, s44, 16 +; GFX9-NEXT: s_add_i32 s59, s59, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s36, s44, s72 +; GFX9-NEXT: s_add_i32 s72, s59, 0x7fff +; GFX9-NEXT: s_or_b32 s73, s58, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s14, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s23, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec +; GFX9-NEXT: s_cselect_b32 s58, s73, s72 +; GFX9-NEXT: s_lshl_b32 s43, s43, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s43, v1 +; GFX9-NEXT: v_readfirstlane_b32 s43, v2 +; GFX9-NEXT: s_lshr_b32 s73, s58, 16 +; GFX9-NEXT: s_bfe_u32 s58, s43, 0x10010 +; GFX9-NEXT: s_add_i32 s58, s58, s43 +; GFX9-NEXT: s_add_i32 s72, s58, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s43, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s23, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s22, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec +; GFX9-NEXT: s_cselect_b32 s43, s43, s72 +; GFX9-NEXT: s_and_b32 s58, s42, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s58, v1 +; GFX9-NEXT: v_readfirstlane_b32 s58, v2 +; GFX9-NEXT: s_bfe_u32 s59, s58, 0x10010 +; GFX9-NEXT: s_lshr_b32 s43, s43, 16 +; GFX9-NEXT: s_add_i32 s59, s59, s58 +; GFX9-NEXT: v_writelane_b32 v21, s73, 9 +; GFX9-NEXT: s_pack_ll_b32_b16 s35, s43, s73 +; GFX9-NEXT: s_add_i32 s72, s59, 0x7fff +; GFX9-NEXT: s_or_b32 s73, s58, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s8, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s22, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec +; GFX9-NEXT: s_cselect_b32 s58, s73, s72 +; GFX9-NEXT: s_lshl_b32 s42, s42, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s42, v1 +; GFX9-NEXT: v_readfirstlane_b32 s42, v2 +; GFX9-NEXT: s_lshr_b32 s72, s58, 16 +; GFX9-NEXT: s_bfe_u32 s58, s42, 0x10010 +; GFX9-NEXT: s_add_i32 s58, s58, s42 +; GFX9-NEXT: s_add_i32 s73, s58, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s42, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s10, s9 -; GFX9-NEXT: s_lshr_b32 s22, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s25, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s60, s22, s8 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec +; GFX9-NEXT: s_cselect_b32 s42, s42, s73 +; GFX9-NEXT: s_and_b32 s58, s29, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s58, v1 +; GFX9-NEXT: v_readfirstlane_b32 s58, v2 +; GFX9-NEXT: s_bfe_u32 s59, s58, 0x10010 +; GFX9-NEXT: s_lshr_b32 s42, s42, 16 +; GFX9-NEXT: s_add_i32 s59, s59, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s34, s42, s72 +; GFX9-NEXT: s_add_i32 s72, s59, 0x7fff +; GFX9-NEXT: s_or_b32 s73, s58, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s15, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s25, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec +; GFX9-NEXT: s_cselect_b32 s58, s73, s72 +; GFX9-NEXT: s_lshl_b32 s29, s29, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s29, v1 +; GFX9-NEXT: v_readfirstlane_b32 s29, v2 +; GFX9-NEXT: s_lshr_b32 s73, s58, 16 +; GFX9-NEXT: s_bfe_u32 s58, s29, 0x10010 +; GFX9-NEXT: s_add_i32 s58, s58, s29 +; GFX9-NEXT: s_add_i32 s72, s58, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s29, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s25, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s24, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec +; GFX9-NEXT: s_cselect_b32 s29, s29, s72 +; GFX9-NEXT: s_and_b32 s58, s28, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s58, v1 +; GFX9-NEXT: v_readfirstlane_b32 s58, v2 +; GFX9-NEXT: s_bfe_u32 s59, s58, 0x10010 +; GFX9-NEXT: s_lshr_b32 s29, s29, 16 +; GFX9-NEXT: s_add_i32 s59, s59, s58 +; GFX9-NEXT: v_writelane_b32 v21, s73, 10 +; GFX9-NEXT: s_pack_ll_b32_b16 s31, s29, s73 +; GFX9-NEXT: s_add_i32 s72, s59, 0x7fff +; GFX9-NEXT: s_or_b32 s73, s58, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s8, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s24, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec +; GFX9-NEXT: s_cselect_b32 s58, s73, s72 +; GFX9-NEXT: s_lshl_b32 s28, s28, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s28, v1 +; GFX9-NEXT: v_readfirstlane_b32 s28, v2 +; GFX9-NEXT: s_lshr_b32 s72, s58, 16 +; GFX9-NEXT: s_bfe_u32 s58, s28, 0x10010 +; GFX9-NEXT: s_add_i32 s58, s58, s28 +; GFX9-NEXT: s_add_i32 s73, s58, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s28, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s10, s9 -; GFX9-NEXT: s_lshr_b32 s24, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s27, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s62, s24, s8 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec +; GFX9-NEXT: s_cselect_b32 s28, s28, s73 +; GFX9-NEXT: s_and_b32 s58, s27, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s58, v1 +; GFX9-NEXT: v_readfirstlane_b32 s58, v2 +; GFX9-NEXT: s_bfe_u32 s59, s58, 0x10010 +; GFX9-NEXT: s_lshr_b32 s28, s28, 16 +; GFX9-NEXT: s_add_i32 s59, s59, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s30, s28, s72 +; GFX9-NEXT: s_add_i32 s72, s59, 0x7fff +; GFX9-NEXT: s_or_b32 s73, s58, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s76, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s27, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec +; GFX9-NEXT: s_cselect_b32 s58, s73, s72 +; GFX9-NEXT: s_lshl_b32 s27, s27, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s27, v1 +; GFX9-NEXT: v_readfirstlane_b32 s27, v2 +; GFX9-NEXT: s_lshr_b32 s73, s58, 16 +; GFX9-NEXT: s_bfe_u32 s58, s27, 0x10010 +; GFX9-NEXT: s_add_i32 s58, s58, s27 +; GFX9-NEXT: s_add_i32 s72, s58, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s27, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s27, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s26, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec +; GFX9-NEXT: s_cselect_b32 s27, s27, s72 +; GFX9-NEXT: s_and_b32 s58, s26, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s58, v1 +; GFX9-NEXT: v_readfirstlane_b32 s58, v2 +; GFX9-NEXT: s_bfe_u32 s59, s58, 0x10010 +; GFX9-NEXT: s_lshr_b32 s27, s27, 16 +; GFX9-NEXT: s_add_i32 s59, s59, s58 +; GFX9-NEXT: v_writelane_b32 v21, s73, 11 +; GFX9-NEXT: s_pack_ll_b32_b16 s95, s27, s73 +; GFX9-NEXT: s_add_i32 s72, s59, 0x7fff +; GFX9-NEXT: s_or_b32 s73, s58, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s8, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s26, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec +; GFX9-NEXT: s_cselect_b32 s58, s73, s72 +; GFX9-NEXT: s_lshl_b32 s26, s26, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s26, v1 +; GFX9-NEXT: v_readfirstlane_b32 s26, v2 +; GFX9-NEXT: s_lshr_b32 s72, s58, 16 +; GFX9-NEXT: s_bfe_u32 s58, s26, 0x10010 +; GFX9-NEXT: s_add_i32 s58, s58, s26 +; GFX9-NEXT: s_add_i32 s73, s58, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s26, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s10, s9 -; GFX9-NEXT: s_lshr_b32 s26, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s29, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s72, s26, s8 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec +; GFX9-NEXT: s_cselect_b32 s26, s26, s73 +; GFX9-NEXT: s_and_b32 s58, s25, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s58, v1 +; GFX9-NEXT: v_readfirstlane_b32 s58, v2 +; GFX9-NEXT: s_bfe_u32 s59, s58, 0x10010 +; GFX9-NEXT: s_lshr_b32 s26, s26, 16 +; GFX9-NEXT: s_add_i32 s59, s59, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s94, s26, s72 +; GFX9-NEXT: s_add_i32 s72, s59, 0x7fff +; GFX9-NEXT: s_or_b32 s73, s58, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s77, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s29, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec +; GFX9-NEXT: s_cselect_b32 s58, s73, s72 +; GFX9-NEXT: s_lshl_b32 s25, s25, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s25, v1 +; GFX9-NEXT: v_readfirstlane_b32 s25, v2 +; GFX9-NEXT: s_lshr_b32 s73, s58, 16 +; GFX9-NEXT: s_bfe_u32 s58, s25, 0x10010 +; GFX9-NEXT: s_add_i32 s58, s58, s25 +; GFX9-NEXT: s_add_i32 s72, s58, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s25, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s29, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s28, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec +; GFX9-NEXT: s_cselect_b32 s25, s25, s72 +; GFX9-NEXT: s_and_b32 s58, s24, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s58, v1 +; GFX9-NEXT: v_readfirstlane_b32 s58, v2 +; GFX9-NEXT: s_bfe_u32 s59, s58, 0x10010 +; GFX9-NEXT: s_lshr_b32 s25, s25, 16 +; GFX9-NEXT: s_add_i32 s59, s59, s58 +; GFX9-NEXT: v_writelane_b32 v21, s73, 12 +; GFX9-NEXT: s_pack_ll_b32_b16 s91, s25, s73 +; GFX9-NEXT: s_add_i32 s72, s59, 0x7fff +; GFX9-NEXT: s_or_b32 s73, s58, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s8, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s28, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec +; GFX9-NEXT: s_cselect_b32 s58, s73, s72 +; GFX9-NEXT: s_lshl_b32 s24, s24, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s24, v1 +; GFX9-NEXT: v_readfirstlane_b32 s24, v2 +; GFX9-NEXT: s_lshr_b32 s72, s58, 16 +; GFX9-NEXT: s_bfe_u32 s58, s24, 0x10010 +; GFX9-NEXT: s_add_i32 s58, s58, s24 +; GFX9-NEXT: s_add_i32 s73, s58, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s24, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s10, s9 -; GFX9-NEXT: s_lshr_b32 s28, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s5, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s74, s28, s8 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec +; GFX9-NEXT: s_cselect_b32 s24, s24, s73 +; GFX9-NEXT: s_and_b32 s58, s23, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s58, v1 +; GFX9-NEXT: v_readfirstlane_b32 s58, v2 +; GFX9-NEXT: s_bfe_u32 s59, s58, 0x10010 +; GFX9-NEXT: s_lshr_b32 s24, s24, 16 +; GFX9-NEXT: s_add_i32 s59, s59, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s90, s24, s72 +; GFX9-NEXT: s_add_i32 s72, s59, 0x7fff +; GFX9-NEXT: s_or_b32 s73, s58, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshl_b32 s5, s5, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s5, v1 -; GFX9-NEXT: v_readfirstlane_b32 s5, v2 -; GFX9-NEXT: s_lshr_b32 s78, s6, 16 -; GFX9-NEXT: s_bfe_u32 s6, s5, 0x10010 -; GFX9-NEXT: s_add_i32 s6, s6, s5 -; GFX9-NEXT: s_add_i32 s8, s6, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s5, 22 +; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec +; GFX9-NEXT: s_cselect_b32 s58, s73, s72 +; GFX9-NEXT: s_lshl_b32 s23, s23, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s23, v1 +; GFX9-NEXT: v_readfirstlane_b32 s23, v2 +; GFX9-NEXT: s_lshr_b32 s73, s58, 16 +; GFX9-NEXT: s_bfe_u32 s58, s23, 0x10010 +; GFX9-NEXT: s_add_i32 s58, s58, s23 +; GFX9-NEXT: s_add_i32 s72, s58, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s23, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s5, s5, s8 -; GFX9-NEXT: s_and_b32 s6, s4, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s8, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s8, s8, s6 -; GFX9-NEXT: s_lshr_b32 s5, s5, 16 -; GFX9-NEXT: s_add_i32 s10, s8, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s6, 22 +; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec +; GFX9-NEXT: s_cselect_b32 s23, s23, s72 +; GFX9-NEXT: s_and_b32 s58, s22, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s58, v1 +; GFX9-NEXT: v_readfirstlane_b32 s58, v2 +; GFX9-NEXT: s_bfe_u32 s59, s58, 0x10010 +; GFX9-NEXT: s_lshr_b32 s23, s23, 16 +; GFX9-NEXT: s_add_i32 s59, s59, s58 +; GFX9-NEXT: v_writelane_b32 v21, s73, 13 +; GFX9-NEXT: s_pack_ll_b32_b16 s77, s23, s73 +; GFX9-NEXT: s_add_i32 s72, s59, 0x7fff +; GFX9-NEXT: s_or_b32 s73, s58, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s6, s10 -; GFX9-NEXT: s_lshl_b32 s4, s4, 16 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v1 -; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: s_bfe_u32 s8, s4, 0x10010 -; GFX9-NEXT: s_add_i32 s8, s8, s4 -; GFX9-NEXT: s_lshr_b32 s6, s6, 16 -; GFX9-NEXT: s_add_i32 s10, s8, 0x7fff -; GFX9-NEXT: s_bitset1_b32 s4, 22 +; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec +; GFX9-NEXT: s_cselect_b32 s58, s73, s72 +; GFX9-NEXT: s_lshl_b32 s22, s22, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s22, v1 +; GFX9-NEXT: v_readfirstlane_b32 s22, v2 +; GFX9-NEXT: s_lshr_b32 s72, s58, 16 +; GFX9-NEXT: s_bfe_u32 s58, s22, 0x10010 +; GFX9-NEXT: s_add_i32 s58, s58, s22 +; GFX9-NEXT: s_add_i32 s73, s58, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s22, 22 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec +; GFX9-NEXT: s_cselect_b32 s22, s22, s73 +; GFX9-NEXT: s_and_b32 s58, s21, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s58, v1 +; GFX9-NEXT: v_readfirstlane_b32 s58, v2 +; GFX9-NEXT: s_bfe_u32 s59, s58, 0x10010 +; GFX9-NEXT: s_lshr_b32 s22, s22, 16 +; GFX9-NEXT: s_add_i32 s59, s59, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s76, s22, s72 +; GFX9-NEXT: s_add_i32 s72, s59, 0x7fff +; GFX9-NEXT: s_or_b32 s73, s58, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec +; GFX9-NEXT: s_cselect_b32 s58, s73, s72 +; GFX9-NEXT: s_lshl_b32 s21, s21, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s21, v1 +; GFX9-NEXT: v_readfirstlane_b32 s21, v2 +; GFX9-NEXT: s_lshr_b32 s68, s58, 16 +; GFX9-NEXT: s_bfe_u32 s58, s21, 0x10010 +; GFX9-NEXT: s_add_i32 s58, s58, s21 +; GFX9-NEXT: s_add_i32 s72, s58, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s21, 22 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec +; GFX9-NEXT: s_cselect_b32 s21, s21, s72 +; GFX9-NEXT: s_and_b32 s58, s20, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s58, v1 +; GFX9-NEXT: v_readfirstlane_b32 s58, v2 +; GFX9-NEXT: s_bfe_u32 s59, s58, 0x10010 +; GFX9-NEXT: s_add_i32 s59, s59, s58 +; GFX9-NEXT: s_lshr_b32 s21, s21, 16 +; GFX9-NEXT: s_add_i32 s72, s59, 0x7fff +; GFX9-NEXT: s_or_b32 s38, s58, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec +; GFX9-NEXT: s_cselect_b32 s58, s38, s72 +; GFX9-NEXT: s_lshl_b32 s20, s20, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s20, v1 +; GFX9-NEXT: v_readfirstlane_b32 s20, v2 +; GFX9-NEXT: s_lshr_b32 s72, s58, 16 +; GFX9-NEXT: s_bfe_u32 s58, s20, 0x10010 +; GFX9-NEXT: s_add_i32 s58, s58, s20 +; GFX9-NEXT: s_add_i32 s38, s58, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s20, 22 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec +; GFX9-NEXT: s_cselect_b32 s20, s20, s38 +; GFX9-NEXT: s_and_b32 s58, s19, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s58, v1 +; GFX9-NEXT: v_readfirstlane_b32 s58, v2 +; GFX9-NEXT: s_bfe_u32 s59, s58, 0x10010 +; GFX9-NEXT: s_add_i32 s59, s59, s58 +; GFX9-NEXT: s_lshr_b32 s20, s20, 16 +; GFX9-NEXT: s_add_i32 s38, s59, 0x7fff +; GFX9-NEXT: s_or_b32 s39, s58, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec +; GFX9-NEXT: s_cselect_b32 s58, s39, s38 +; GFX9-NEXT: s_lshl_b32 s19, s19, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s19, v1 +; GFX9-NEXT: v_readfirstlane_b32 s19, v2 +; GFX9-NEXT: s_lshr_b32 s69, s58, 16 +; GFX9-NEXT: s_bfe_u32 s58, s19, 0x10010 +; GFX9-NEXT: s_add_i32 s58, s58, s19 +; GFX9-NEXT: s_add_i32 s38, s58, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s19, 22 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec +; GFX9-NEXT: s_cselect_b32 s19, s19, s38 +; GFX9-NEXT: s_and_b32 s58, s18, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s58, v1 +; GFX9-NEXT: v_readfirstlane_b32 s58, v2 +; GFX9-NEXT: s_bfe_u32 s59, s58, 0x10010 +; GFX9-NEXT: s_add_i32 s59, s59, s58 +; GFX9-NEXT: s_lshr_b32 s19, s19, 16 +; GFX9-NEXT: s_add_i32 s38, s59, 0x7fff +; GFX9-NEXT: s_or_b32 s39, s58, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec +; GFX9-NEXT: s_cselect_b32 s58, s39, s38 +; GFX9-NEXT: s_lshl_b32 s18, s18, 16 +; GFX9-NEXT: v_add_f32_e32 v1, s18, v1 +; GFX9-NEXT: v_readfirstlane_b32 s18, v1 +; GFX9-NEXT: s_lshr_b32 s38, s58, 16 +; GFX9-NEXT: s_bfe_u32 s58, s18, 0x10010 +; GFX9-NEXT: s_add_i32 s58, s58, s18 +; GFX9-NEXT: s_add_i32 s39, s58, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s18, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_lshrrev_b64 v[1:2], 24, v[25:26] -; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec -; GFX9-NEXT: v_lshrrev_b64 v[2:3], 24, v[23:24] -; GFX9-NEXT: s_cselect_b32 s4, s4, s10 -; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[21:22] -; GFX9-NEXT: v_lshrrev_b64 v[9:10], 24, v[15:16] -; GFX9-NEXT: s_pack_ll_b32_b16 s47, s17, s11 -; GFX9-NEXT: s_pack_ll_b32_b16 s57, s19, s12 -; GFX9-NEXT: s_pack_ll_b32_b16 s59, s21, s13 -; GFX9-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NEXT: v_lshrrev_b64 v[4:5], 24, v[19:20] -; GFX9-NEXT: v_lshrrev_b64 v[10:11], 24, v[13:14] -; GFX9-NEXT: s_pack_ll_b32_b16 s61, s23, s14 -; GFX9-NEXT: s_pack_ll_b32_b16 s63, s25, s15 -; GFX9-NEXT: s_pack_ll_b32_b16 s73, s27, s76 -; GFX9-NEXT: s_pack_ll_b32_b16 s75, s29, s77 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s5, s78 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s4, s6 -; GFX9-NEXT: s_lshr_b64 s[40:41], s[58:59], 24 -; GFX9-NEXT: s_lshr_b64 s[42:43], s[56:57], 24 -; GFX9-NEXT: s_lshr_b64 s[44:45], s[46:47], 24 -; GFX9-NEXT: v_lshrrev_b64 v[5:6], 24, v[17:18] -; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[7:8] -; GFX9-NEXT: s_lshr_b64 s[34:35], s[6:7], 24 -; GFX9-NEXT: s_lshr_b64 s[36:37], s[74:75], 24 -; GFX9-NEXT: s_lshr_b64 s[38:39], s[72:73], 24 -; GFX9-NEXT: s_lshr_b64 s[48:49], s[62:63], 24 -; GFX9-NEXT: s_lshr_b64 s[50:51], s[60:61], 24 -; GFX9-NEXT: s_lshr_b32 s9, s7, 24 -; GFX9-NEXT: s_lshr_b32 s10, s7, 8 -; GFX9-NEXT: s_lshr_b32 s41, s6, 16 -; GFX9-NEXT: s_lshr_b32 s43, s6, 8 -; GFX9-NEXT: s_lshr_b32 s45, s75, 24 -; GFX9-NEXT: s_lshr_b32 s75, s75, 8 -; GFX9-NEXT: s_lshr_b32 s79, s74, 16 -; GFX9-NEXT: s_lshr_b32 s74, s74, 8 -; GFX9-NEXT: s_lshr_b32 s88, s73, 24 -; GFX9-NEXT: s_lshr_b32 s73, s73, 8 -; GFX9-NEXT: s_lshr_b32 s89, s72, 16 -; GFX9-NEXT: s_lshr_b32 s72, s72, 8 -; GFX9-NEXT: s_lshr_b32 s90, s63, 24 -; GFX9-NEXT: s_lshr_b32 s63, s63, 8 -; GFX9-NEXT: s_lshr_b32 s91, s62, 16 -; GFX9-NEXT: s_lshr_b32 s62, s62, 8 -; GFX9-NEXT: s_lshr_b32 s92, s61, 24 -; GFX9-NEXT: s_lshr_b32 s61, s61, 8 -; GFX9-NEXT: s_lshr_b32 s93, s60, 16 -; GFX9-NEXT: s_lshr_b32 s60, s60, 8 -; GFX9-NEXT: s_lshr_b32 s94, s59, 24 -; GFX9-NEXT: s_lshr_b32 s59, s59, 8 -; GFX9-NEXT: s_lshr_b32 s95, s58, 16 -; GFX9-NEXT: s_lshr_b32 s58, s58, 8 -; GFX9-NEXT: s_lshr_b32 vcc_lo, s57, 24 -; GFX9-NEXT: s_lshr_b32 s57, s57, 8 -; GFX9-NEXT: s_lshr_b32 vcc_hi, s56, 16 -; GFX9-NEXT: s_lshr_b32 s56, s56, 8 -; GFX9-NEXT: s_lshr_b32 s30, s47, 24 -; GFX9-NEXT: s_lshr_b32 s47, s47, 8 -; GFX9-NEXT: s_lshr_b32 s8, s46, 16 -; GFX9-NEXT: s_lshr_b32 s7, s46, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 8, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 8, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 24, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 8, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 24, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 8, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 24, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 8, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 24, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 8, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 24, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v7 -; GFX9-NEXT: s_branch .LBB91_5 -; GFX9-NEXT: .LBB91_3: -; GFX9-NEXT: ; implicit-def: $sgpr6 -; GFX9-NEXT: ; kill: killed $sgpr6 -; GFX9-NEXT: ; implicit-def: $sgpr78 -; GFX9-NEXT: ; implicit-def: $sgpr6 -; GFX9-NEXT: ; kill: killed $sgpr6 -; GFX9-NEXT: v_writelane_b32 v62, s78, 0 -; GFX9-NEXT: ; implicit-def: $sgpr6 -; GFX9-NEXT: ; kill: killed $sgpr6 -; GFX9-NEXT: v_writelane_b32 v62, s79, 1 -; GFX9-NEXT: ; implicit-def: $sgpr6 -; GFX9-NEXT: ; kill: killed $sgpr6 -; GFX9-NEXT: ; implicit-def: $sgpr78 -; GFX9-NEXT: ; implicit-def: $sgpr83 -; GFX9-NEXT: ; implicit-def: $sgpr38 -; GFX9-NEXT: ; implicit-def: $sgpr81 -; GFX9-NEXT: ; implicit-def: $sgpr8 -; GFX9-NEXT: ; implicit-def: $sgpr80 -; GFX9-NEXT: ; implicit-def: $sgpr86 -; GFX9-NEXT: ; implicit-def: $sgpr39 -; GFX9-NEXT: ; implicit-def: $sgpr84 -; GFX9-NEXT: ; implicit-def: $sgpr9 -; GFX9-NEXT: ; implicit-def: $sgpr82 -; GFX9-NEXT: ; implicit-def: $sgpr97 -; GFX9-NEXT: ; implicit-def: $sgpr48 -; GFX9-NEXT: ; implicit-def: $sgpr87 -; GFX9-NEXT: ; implicit-def: $sgpr10 -; GFX9-NEXT: ; implicit-def: $sgpr85 -; GFX9-NEXT: ; implicit-def: $sgpr54 -; GFX9-NEXT: ; implicit-def: $sgpr49 -; GFX9-NEXT: ; implicit-def: $sgpr98 -; GFX9-NEXT: ; implicit-def: $sgpr11 -; GFX9-NEXT: ; implicit-def: $sgpr96 -; GFX9-NEXT: ; implicit-def: $sgpr65 -; GFX9-NEXT: ; implicit-def: $sgpr50 -; GFX9-NEXT: ; implicit-def: $sgpr55 -; GFX9-NEXT: ; implicit-def: $sgpr12 -; GFX9-NEXT: ; implicit-def: $sgpr99 -; GFX9-NEXT: ; implicit-def: $sgpr68 -; GFX9-NEXT: ; implicit-def: $sgpr51 -; GFX9-NEXT: ; implicit-def: $sgpr66 -; GFX9-NEXT: ; implicit-def: $sgpr13 -; GFX9-NEXT: ; implicit-def: $sgpr64 -; GFX9-NEXT: ; implicit-def: $sgpr71 -; GFX9-NEXT: ; implicit-def: $sgpr69 -; GFX9-NEXT: ; implicit-def: $sgpr14 -; GFX9-NEXT: ; implicit-def: $sgpr67 -; GFX9-NEXT: ; implicit-def: $sgpr52 -; GFX9-NEXT: ; implicit-def: $sgpr53 -; GFX9-NEXT: ; implicit-def: $sgpr7 -; GFX9-NEXT: ; implicit-def: $sgpr15 -; GFX9-NEXT: ; implicit-def: $sgpr70 -; GFX9-NEXT: ; implicit-def: $sgpr44 -; GFX9-NEXT: ; implicit-def: $sgpr42 -; GFX9-NEXT: ; implicit-def: $sgpr40 -; GFX9-NEXT: ; implicit-def: $sgpr36 -; GFX9-NEXT: ; implicit-def: $sgpr34 -; GFX9-NEXT: ; implicit-def: $sgpr30 -; GFX9-NEXT: ; implicit-def: $sgpr94 -; GFX9-NEXT: ; implicit-def: $sgpr92 -; GFX9-NEXT: ; implicit-def: $sgpr90 -; GFX9-NEXT: ; implicit-def: $sgpr88 -; GFX9-NEXT: ; implicit-def: $sgpr6 -; GFX9-NEXT: ; kill: killed $sgpr6 -; GFX9-NEXT: v_writelane_b32 v62, s78, 2 -; GFX9-NEXT: ; implicit-def: $sgpr6 -; GFX9-NEXT: ; kill: killed $sgpr6 -; GFX9-NEXT: v_writelane_b32 v62, s79, 3 -; GFX9-NEXT: ; implicit-def: $sgpr6 -; GFX9-NEXT: ; kill: killed $sgpr6 -; GFX9-NEXT: ; implicit-def: $sgpr78 -; GFX9-NEXT: ; implicit-def: $sgpr6 -; GFX9-NEXT: ; kill: killed $sgpr6 -; GFX9-NEXT: v_writelane_b32 v62, s78, 4 -; GFX9-NEXT: ; implicit-def: $sgpr6 -; GFX9-NEXT: ; kill: killed $sgpr6 -; GFX9-NEXT: v_writelane_b32 v62, s79, 5 -; GFX9-NEXT: ; implicit-def: $sgpr6 -; GFX9-NEXT: ; kill: killed $sgpr6 -; GFX9-NEXT: ; implicit-def: $sgpr78 -; GFX9-NEXT: ; implicit-def: $sgpr6 -; GFX9-NEXT: ; kill: killed $sgpr6 -; GFX9-NEXT: v_writelane_b32 v62, s78, 6 -; GFX9-NEXT: ; implicit-def: $sgpr6 -; GFX9-NEXT: ; kill: killed $sgpr6 -; GFX9-NEXT: v_writelane_b32 v62, s79, 7 -; GFX9-NEXT: ; implicit-def: $sgpr6 -; GFX9-NEXT: ; kill: killed $sgpr6 -; GFX9-NEXT: ; implicit-def: $sgpr78 -; GFX9-NEXT: ; implicit-def: $sgpr6 -; GFX9-NEXT: ; kill: killed $sgpr6 -; GFX9-NEXT: v_writelane_b32 v62, s78, 8 -; GFX9-NEXT: ; implicit-def: $sgpr6 -; GFX9-NEXT: ; kill: killed $sgpr6 -; GFX9-NEXT: v_writelane_b32 v62, s79, 9 -; GFX9-NEXT: ; implicit-def: $sgpr6 -; GFX9-NEXT: ; kill: killed $sgpr6 -; GFX9-NEXT: ; implicit-def: $sgpr78 -; GFX9-NEXT: ; implicit-def: $sgpr6 -; GFX9-NEXT: ; kill: killed $sgpr6 -; GFX9-NEXT: ; implicit-def: $sgpr6 -; GFX9-NEXT: ; kill: killed $sgpr6 -; GFX9-NEXT: ; implicit-def: $sgpr6 -; GFX9-NEXT: ; kill: killed $sgpr6 -; GFX9-NEXT: ; implicit-def: $sgpr6 -; GFX9-NEXT: ; kill: killed $sgpr6 -; GFX9-NEXT: ; implicit-def: $sgpr6 -; GFX9-NEXT: ; kill: killed $sgpr6 -; GFX9-NEXT: ; implicit-def: $sgpr6 -; GFX9-NEXT: ; kill: killed $sgpr6 -; GFX9-NEXT: ; implicit-def: $sgpr6 -; GFX9-NEXT: ; kill: killed $sgpr6 -; GFX9-NEXT: ; implicit-def: $sgpr6 -; GFX9-NEXT: ; kill: killed $sgpr6 -; GFX9-NEXT: ; implicit-def: $sgpr6 -; GFX9-NEXT: ; kill: killed $sgpr6 -; GFX9-NEXT: ; implicit-def: $sgpr6 -; GFX9-NEXT: ; kill: killed $sgpr6 -; GFX9-NEXT: ; implicit-def: $sgpr6 -; GFX9-NEXT: ; kill: killed $sgpr6 -; GFX9-NEXT: ; implicit-def: $sgpr6 -; GFX9-NEXT: ; kill: killed $sgpr6 -; GFX9-NEXT: ; implicit-def: $sgpr6 -; GFX9-NEXT: ; kill: killed $sgpr6 -; GFX9-NEXT: ; implicit-def: $sgpr6 -; GFX9-NEXT: ; kill: killed $sgpr6 -; GFX9-NEXT: ; implicit-def: $sgpr6 -; GFX9-NEXT: ; kill: killed $sgpr6 -; GFX9-NEXT: ; implicit-def: $sgpr6 -; GFX9-NEXT: ; kill: killed $sgpr6 -; GFX9-NEXT: ; implicit-def: $sgpr6 -; GFX9-NEXT: ; kill: killed $sgpr6 -; GFX9-NEXT: ; implicit-def: $sgpr6 -; GFX9-NEXT: ; kill: killed $sgpr6 -; GFX9-NEXT: ; implicit-def: $sgpr6 -; GFX9-NEXT: ; kill: killed $sgpr6 -; GFX9-NEXT: ; implicit-def: $sgpr6 -; GFX9-NEXT: ; kill: killed $sgpr6 -; GFX9-NEXT: ; implicit-def: $sgpr6 -; GFX9-NEXT: ; kill: killed $sgpr6 -; GFX9-NEXT: ; implicit-def: $sgpr6 -; GFX9-NEXT: ; kill: killed $sgpr6 -; GFX9-NEXT: ; implicit-def: $sgpr6 -; GFX9-NEXT: ; kill: killed $sgpr6 -; GFX9-NEXT: ; implicit-def: $sgpr6 -; GFX9-NEXT: ; kill: killed $sgpr6 -; GFX9-NEXT: ; implicit-def: $sgpr6 -; GFX9-NEXT: s_branch .LBB91_2 -; GFX9-NEXT: .LBB91_4: -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v1, s76 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v1, s77 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s59, s19, s69 +; GFX9-NEXT: s_cselect_b32 s18, s18, s39 +; GFX9-NEXT: s_lshr_b32 s18, s18, 16 +; GFX9-NEXT: s_lshr_b32 vcc_lo, s59, 24 +; GFX9-NEXT: s_pack_ll_b32_b16 s58, s18, s38 +; GFX9-NEXT: v_writelane_b32 v21, vcc_lo, 14 +; GFX9-NEXT: s_lshr_b32 vcc_lo, s59, 8 +; GFX9-NEXT: v_writelane_b32 v21, vcc_lo, 15 +; GFX9-NEXT: s_lshr_b32 vcc_lo, s58, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s73, s21, s68 +; GFX9-NEXT: v_writelane_b32 v21, vcc_lo, 16 +; GFX9-NEXT: s_lshr_b32 vcc_lo, s58, 8 +; GFX9-NEXT: s_lshr_b64 s[58:59], s[58:59], 24 +; GFX9-NEXT: v_writelane_b32 v21, vcc_lo, 17 +; GFX9-NEXT: s_lshr_b32 s59, s73, 24 +; GFX9-NEXT: s_pack_ll_b32_b16 s72, s20, s72 +; GFX9-NEXT: v_writelane_b32 v21, s59, 18 +; GFX9-NEXT: s_lshr_b32 s59, s73, 8 +; GFX9-NEXT: v_writelane_b32 v21, s59, 19 +; GFX9-NEXT: s_lshr_b32 s59, s72, 16 +; GFX9-NEXT: v_writelane_b32 v21, s59, 20 +; GFX9-NEXT: s_lshr_b32 s59, s72, 8 +; GFX9-NEXT: v_writelane_b32 v21, s59, 21 +; GFX9-NEXT: s_lshr_b32 s59, s77, 24 +; GFX9-NEXT: v_writelane_b32 v21, s59, 22 +; GFX9-NEXT: s_lshr_b32 s59, s77, 8 +; GFX9-NEXT: v_writelane_b32 v21, s59, 23 +; GFX9-NEXT: s_lshr_b32 s59, s76, 16 +; GFX9-NEXT: v_writelane_b32 v21, s59, 24 +; GFX9-NEXT: s_lshr_b32 s59, s76, 8 +; GFX9-NEXT: v_writelane_b32 v21, s59, 25 +; GFX9-NEXT: s_lshr_b32 s59, s91, 24 +; GFX9-NEXT: v_writelane_b32 v21, s59, 26 +; GFX9-NEXT: s_lshr_b32 s59, s91, 8 +; GFX9-NEXT: v_writelane_b32 v21, s59, 27 +; GFX9-NEXT: s_lshr_b32 s59, s90, 16 +; GFX9-NEXT: v_writelane_b32 v21, s59, 28 +; GFX9-NEXT: s_lshr_b32 s59, s90, 8 +; GFX9-NEXT: v_writelane_b32 v21, s59, 29 +; GFX9-NEXT: s_lshr_b32 s59, s95, 24 +; GFX9-NEXT: s_pack_ll_b32_b16 s46, s4, s46 +; GFX9-NEXT: s_lshr_b64 s[72:73], s[72:73], 24 +; GFX9-NEXT: s_lshr_b64 s[76:77], s[76:77], 24 +; GFX9-NEXT: s_lshr_b64 s[90:91], s[90:91], 24 +; GFX9-NEXT: v_writelane_b32 v21, s59, 30 +; GFX9-NEXT: s_lshr_b32 s59, s95, 8 +; GFX9-NEXT: v_writelane_b32 v21, s59, 31 +; GFX9-NEXT: s_lshr_b32 s59, s47, 24 +; GFX9-NEXT: s_lshr_b32 s73, s47, 8 +; GFX9-NEXT: s_lshr_b32 s77, s46, 16 +; GFX9-NEXT: s_lshr_b32 s91, s46, 8 +; GFX9-NEXT: s_lshr_b64 s[46:47], s[46:47], 24 +; GFX9-NEXT: s_lshr_b32 s47, s61, 24 +; GFX9-NEXT: v_writelane_b32 v21, s47, 32 +; GFX9-NEXT: s_lshr_b32 s47, s61, 8 +; GFX9-NEXT: v_writelane_b32 v21, s47, 33 +; GFX9-NEXT: s_lshr_b32 s47, s60, 16 +; GFX9-NEXT: v_writelane_b32 v21, s47, 34 +; GFX9-NEXT: s_lshr_b32 s47, s60, 8 +; GFX9-NEXT: v_writelane_b32 v21, s47, 35 +; GFX9-NEXT: s_lshr_b32 s47, s63, 8 +; GFX9-NEXT: s_lshr_b64 s[60:61], s[60:61], 24 +; GFX9-NEXT: v_writelane_b32 v21, s47, 36 +; GFX9-NEXT: s_lshr_b32 s47, s62, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s57, s7, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s56, s6, s56 +; GFX9-NEXT: s_lshr_b32 s38, s94, 16 +; GFX9-NEXT: s_lshr_b32 s48, s94, 8 +; GFX9-NEXT: s_lshr_b64 s[94:95], s[94:95], 24 +; GFX9-NEXT: s_lshr_b32 s50, s31, 24 +; GFX9-NEXT: s_lshr_b32 s51, s31, 8 +; GFX9-NEXT: s_lshr_b32 s53, s30, 16 +; GFX9-NEXT: s_lshr_b32 s71, s30, 8 +; GFX9-NEXT: s_lshr_b64 s[30:31], s[30:31], 24 +; GFX9-NEXT: s_lshr_b32 s80, s35, 24 +; GFX9-NEXT: s_lshr_b32 s82, s35, 8 +; GFX9-NEXT: s_lshr_b32 s65, s34, 16 +; GFX9-NEXT: s_lshr_b32 s85, s34, 8 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[34:35], 24 +; GFX9-NEXT: s_lshr_b32 s66, s37, 24 +; GFX9-NEXT: s_lshr_b32 s97, s37, 8 +; GFX9-NEXT: s_lshr_b32 s99, s36, 16 +; GFX9-NEXT: s_lshr_b32 s67, s36, 8 +; GFX9-NEXT: s_lshr_b64 s[36:37], s[36:37], 24 +; GFX9-NEXT: s_lshr_b32 s61, s63, 24 +; GFX9-NEXT: v_writelane_b32 v21, s47, 37 +; GFX9-NEXT: s_lshr_b32 s47, s62, 8 +; GFX9-NEXT: s_lshr_b64 s[62:63], s[62:63], 24 +; GFX9-NEXT: s_lshr_b32 s95, s57, 24 +; GFX9-NEXT: s_lshr_b32 s31, s57, 8 +; GFX9-NEXT: s_lshr_b32 s35, s56, 16 +; GFX9-NEXT: s_lshr_b32 s37, s56, 8 +; GFX9-NEXT: s_lshr_b64 s[56:57], s[56:57], 24 +; GFX9-NEXT: v_writelane_b32 v21, s47, 38 +; GFX9-NEXT: s_lshr_b32 s63, s75, 24 +; GFX9-NEXT: s_lshr_b32 s47, s75, 8 +; GFX9-NEXT: s_lshr_b32 s54, s74, 16 +; GFX9-NEXT: s_lshr_b32 s39, s74, 8 +; GFX9-NEXT: s_lshr_b64 s[74:75], s[74:75], 24 +; GFX9-NEXT: s_lshr_b32 s49, s79, 24 +; GFX9-NEXT: s_lshr_b32 s55, s79, 8 +; GFX9-NEXT: s_lshr_b32 s52, s78, 16 +; GFX9-NEXT: s_lshr_b32 s70, s78, 8 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[78:79], 24 +; GFX9-NEXT: s_lshr_b32 s64, s93, 24 +; GFX9-NEXT: s_lshr_b32 s81, s93, 8 +; GFX9-NEXT: s_lshr_b32 s83, s92, 16 +; GFX9-NEXT: s_lshr_b32 s84, s92, 8 +; GFX9-NEXT: s_lshr_b64 s[92:93], s[92:93], 24 +; GFX9-NEXT: s_lshr_b32 s86, s89, 24 +; GFX9-NEXT: s_lshr_b32 s87, s89, 8 +; GFX9-NEXT: s_lshr_b32 s96, s88, 16 +; GFX9-NEXT: s_lshr_b32 s98, s88, 8 +; GFX9-NEXT: s_lshr_b64 s[88:89], s[88:89], 24 +; GFX9-NEXT: v_writelane_b32 v21, s47, 39 +; GFX9-NEXT: .LBB91_3: ; %end +; GFX9-NEXT: s_lshl_b32 s47, s67, 8 +; GFX9-NEXT: s_and_b32 s44, s44, 0xff +; GFX9-NEXT: s_or_b32 s44, s44, s47 +; GFX9-NEXT: s_lshl_b32 s47, s36, 8 +; GFX9-NEXT: s_and_b32 s57, s99, 0xff +; GFX9-NEXT: s_or_b32 s47, s57, s47 +; GFX9-NEXT: s_and_b32 s44, s44, 0xffff +; GFX9-NEXT: s_lshl_b32 s47, s47, 16 +; GFX9-NEXT: s_or_b32 s44, s44, s47 +; GFX9-NEXT: v_mov_b32_e32 v1, s44 +; GFX9-NEXT: s_and_b32 s44, s45, 0xff +; GFX9-NEXT: s_lshl_b32 s45, s97, 8 +; GFX9-NEXT: s_or_b32 s44, s44, s45 +; GFX9-NEXT: v_readlane_b32 s45, v21, 8 +; GFX9-NEXT: s_and_b32 s45, s45, 0xff +; GFX9-NEXT: s_lshl_b32 s47, s66, 8 +; GFX9-NEXT: s_or_b32 s45, s45, s47 +; GFX9-NEXT: s_and_b32 s44, s44, 0xffff +; GFX9-NEXT: s_lshl_b32 s45, s45, 16 +; GFX9-NEXT: s_or_b32 s44, s44, s45 +; GFX9-NEXT: v_mov_b32_e32 v2, s44 +; GFX9-NEXT: s_lshl_b32 s44, s85, 8 +; GFX9-NEXT: s_and_b32 s42, s42, 0xff +; GFX9-NEXT: s_or_b32 s42, s42, s44 +; GFX9-NEXT: s_lshl_b32 s44, s34, 8 +; GFX9-NEXT: s_and_b32 s45, s65, 0xff +; GFX9-NEXT: s_or_b32 s44, s45, s44 +; GFX9-NEXT: s_and_b32 s42, s42, 0xffff +; GFX9-NEXT: s_lshl_b32 s44, s44, 16 +; GFX9-NEXT: s_or_b32 s42, s42, s44 +; GFX9-NEXT: v_mov_b32_e32 v3, s42 +; GFX9-NEXT: s_and_b32 s42, s43, 0xff +; GFX9-NEXT: s_lshl_b32 s43, s82, 8 +; GFX9-NEXT: s_or_b32 s42, s42, s43 +; GFX9-NEXT: v_readlane_b32 s43, v21, 9 +; GFX9-NEXT: s_and_b32 s43, s43, 0xff +; GFX9-NEXT: s_lshl_b32 s44, s80, 8 +; GFX9-NEXT: s_or_b32 s43, s43, s44 +; GFX9-NEXT: s_and_b32 s42, s42, 0xffff +; GFX9-NEXT: s_lshl_b32 s43, s43, 16 +; GFX9-NEXT: s_or_b32 s42, s42, s43 +; GFX9-NEXT: v_mov_b32_e32 v4, s42 +; GFX9-NEXT: s_lshl_b32 s42, s71, 8 +; GFX9-NEXT: s_and_b32 s28, s28, 0xff +; GFX9-NEXT: s_or_b32 s28, s28, s42 +; GFX9-NEXT: s_lshl_b32 s42, s30, 8 +; GFX9-NEXT: s_and_b32 s43, s53, 0xff +; GFX9-NEXT: s_or_b32 s42, s43, s42 +; GFX9-NEXT: s_and_b32 s28, s28, 0xffff +; GFX9-NEXT: s_lshl_b32 s42, s42, 16 +; GFX9-NEXT: s_or_b32 s28, s28, s42 +; GFX9-NEXT: v_mov_b32_e32 v5, s28 +; GFX9-NEXT: s_and_b32 s28, s29, 0xff +; GFX9-NEXT: s_lshl_b32 s29, s51, 8 +; GFX9-NEXT: s_or_b32 s28, s28, s29 +; GFX9-NEXT: v_readlane_b32 s29, v21, 10 +; GFX9-NEXT: s_and_b32 s29, s29, 0xff +; GFX9-NEXT: s_lshl_b32 s42, s50, 8 +; GFX9-NEXT: s_or_b32 s29, s29, s42 +; GFX9-NEXT: s_and_b32 s28, s28, 0xffff +; GFX9-NEXT: s_lshl_b32 s29, s29, 16 +; GFX9-NEXT: s_or_b32 s28, s28, s29 +; GFX9-NEXT: v_mov_b32_e32 v6, s28 +; GFX9-NEXT: s_lshl_b32 s28, s48, 8 +; GFX9-NEXT: s_and_b32 s26, s26, 0xff +; GFX9-NEXT: s_or_b32 s26, s26, s28 +; GFX9-NEXT: s_lshl_b32 s28, s94, 8 +; GFX9-NEXT: s_and_b32 s29, s38, 0xff +; GFX9-NEXT: s_or_b32 s28, s29, s28 +; GFX9-NEXT: s_and_b32 s26, s26, 0xffff +; GFX9-NEXT: s_lshl_b32 s28, s28, 16 +; GFX9-NEXT: s_or_b32 s26, s26, s28 +; GFX9-NEXT: v_mov_b32_e32 v7, s26 +; GFX9-NEXT: s_and_b32 s26, s27, 0xff +; GFX9-NEXT: v_readlane_b32 s27, v21, 31 +; GFX9-NEXT: s_lshl_b32 s27, s27, 8 +; GFX9-NEXT: s_or_b32 s26, s26, s27 +; GFX9-NEXT: v_readlane_b32 s27, v21, 11 +; GFX9-NEXT: v_readlane_b32 s28, v21, 30 +; GFX9-NEXT: s_and_b32 s27, s27, 0xff +; GFX9-NEXT: s_lshl_b32 s28, s28, 8 +; GFX9-NEXT: s_or_b32 s27, s27, s28 +; GFX9-NEXT: s_and_b32 s26, s26, 0xffff +; GFX9-NEXT: s_lshl_b32 s27, s27, 16 +; GFX9-NEXT: s_or_b32 s26, s26, s27 +; GFX9-NEXT: v_mov_b32_e32 v8, s26 +; GFX9-NEXT: v_readlane_b32 s26, v21, 29 +; GFX9-NEXT: s_lshl_b32 s26, s26, 8 +; GFX9-NEXT: s_and_b32 s24, s24, 0xff +; GFX9-NEXT: v_readlane_b32 s27, v21, 28 +; GFX9-NEXT: s_or_b32 s24, s24, s26 +; GFX9-NEXT: s_lshl_b32 s26, s90, 8 +; GFX9-NEXT: s_and_b32 s27, s27, 0xff +; GFX9-NEXT: s_or_b32 s26, s27, s26 +; GFX9-NEXT: s_and_b32 s24, s24, 0xffff +; GFX9-NEXT: s_lshl_b32 s26, s26, 16 +; GFX9-NEXT: s_or_b32 s24, s24, s26 +; GFX9-NEXT: v_mov_b32_e32 v9, s24 +; GFX9-NEXT: s_and_b32 s24, s25, 0xff +; GFX9-NEXT: v_readlane_b32 s25, v21, 27 +; GFX9-NEXT: s_lshl_b32 s25, s25, 8 +; GFX9-NEXT: s_or_b32 s24, s24, s25 +; GFX9-NEXT: v_readlane_b32 s25, v21, 12 +; GFX9-NEXT: v_readlane_b32 s26, v21, 26 +; GFX9-NEXT: s_and_b32 s25, s25, 0xff +; GFX9-NEXT: s_lshl_b32 s26, s26, 8 +; GFX9-NEXT: s_or_b32 s25, s25, s26 +; GFX9-NEXT: s_and_b32 s24, s24, 0xffff +; GFX9-NEXT: s_lshl_b32 s25, s25, 16 +; GFX9-NEXT: s_or_b32 s24, s24, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_readlane_b32 s24, v21, 25 +; GFX9-NEXT: s_lshl_b32 s24, s24, 8 +; GFX9-NEXT: s_and_b32 s22, s22, 0xff +; GFX9-NEXT: v_readlane_b32 s25, v21, 24 +; GFX9-NEXT: s_or_b32 s22, s22, s24 +; GFX9-NEXT: s_lshl_b32 s24, s76, 8 +; GFX9-NEXT: s_and_b32 s25, s25, 0xff +; GFX9-NEXT: s_or_b32 s24, s25, s24 +; GFX9-NEXT: s_and_b32 s22, s22, 0xffff +; GFX9-NEXT: s_lshl_b32 s24, s24, 16 +; GFX9-NEXT: s_or_b32 s22, s22, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s22 +; GFX9-NEXT: s_and_b32 s22, s23, 0xff +; GFX9-NEXT: v_readlane_b32 s23, v21, 23 +; GFX9-NEXT: s_lshl_b32 s23, s23, 8 +; GFX9-NEXT: s_or_b32 s22, s22, s23 +; GFX9-NEXT: v_readlane_b32 s23, v21, 13 +; GFX9-NEXT: v_readlane_b32 s24, v21, 22 +; GFX9-NEXT: s_and_b32 s23, s23, 0xff +; GFX9-NEXT: s_lshl_b32 s24, s24, 8 +; GFX9-NEXT: s_or_b32 s23, s23, s24 +; GFX9-NEXT: s_and_b32 s22, s22, 0xffff +; GFX9-NEXT: s_lshl_b32 s23, s23, 16 +; GFX9-NEXT: s_or_b32 s22, s22, s23 +; GFX9-NEXT: v_mov_b32_e32 v12, s22 +; GFX9-NEXT: v_readlane_b32 s22, v21, 21 +; GFX9-NEXT: s_lshl_b32 s22, s22, 8 +; GFX9-NEXT: s_and_b32 s20, s20, 0xff +; GFX9-NEXT: v_readlane_b32 s23, v21, 20 +; GFX9-NEXT: s_or_b32 s20, s20, s22 +; GFX9-NEXT: s_lshl_b32 s22, s72, 8 +; GFX9-NEXT: s_and_b32 s23, s23, 0xff +; GFX9-NEXT: s_or_b32 s22, s23, s22 +; GFX9-NEXT: s_and_b32 s20, s20, 0xffff +; GFX9-NEXT: s_lshl_b32 s22, s22, 16 +; GFX9-NEXT: s_or_b32 s20, s20, s22 +; GFX9-NEXT: v_mov_b32_e32 v13, s20 +; GFX9-NEXT: s_and_b32 s20, s21, 0xff +; GFX9-NEXT: v_readlane_b32 s21, v21, 19 +; GFX9-NEXT: s_lshl_b32 s21, s21, 8 +; GFX9-NEXT: v_readlane_b32 s22, v21, 18 +; GFX9-NEXT: s_or_b32 s20, s20, s21 +; GFX9-NEXT: s_and_b32 s21, s68, 0xff +; GFX9-NEXT: s_lshl_b32 s22, s22, 8 +; GFX9-NEXT: s_or_b32 s21, s21, s22 +; GFX9-NEXT: s_and_b32 s20, s20, 0xffff +; GFX9-NEXT: s_lshl_b32 s21, s21, 16 +; GFX9-NEXT: s_or_b32 s20, s20, s21 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: v_mov_b32_e32 v1, s20 +; GFX9-NEXT: v_readlane_b32 s20, v21, 17 +; GFX9-NEXT: s_and_b32 s18, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s20, s20, 8 +; GFX9-NEXT: s_or_b32 s18, s18, s20 +; GFX9-NEXT: v_readlane_b32 s20, v21, 16 +; GFX9-NEXT: s_and_b32 s20, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s21, s58, 8 +; GFX9-NEXT: s_or_b32 s20, s20, s21 +; GFX9-NEXT: s_and_b32 s18, s18, 0xffff +; GFX9-NEXT: s_lshl_b32 s20, s20, 16 +; GFX9-NEXT: s_or_b32 s18, s18, s20 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: v_mov_b32_e32 v1, s18 +; GFX9-NEXT: s_and_b32 s18, s19, 0xff +; GFX9-NEXT: v_readlane_b32 s19, v21, 15 +; GFX9-NEXT: s_lshl_b32 s19, s19, 8 +; GFX9-NEXT: v_readlane_b32 s20, v21, 14 +; GFX9-NEXT: s_or_b32 s18, s18, s19 +; GFX9-NEXT: s_and_b32 s19, s69, 0xff +; GFX9-NEXT: s_lshl_b32 s20, s20, 8 +; GFX9-NEXT: s_or_b32 s19, s19, s20 +; GFX9-NEXT: s_and_b32 s18, s18, 0xffff +; GFX9-NEXT: s_lshl_b32 s19, s19, 16 +; GFX9-NEXT: s_or_b32 s18, s18, s19 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: v_mov_b32_e32 v1, s18 +; GFX9-NEXT: s_and_b32 s18, s40, 0xff +; GFX9-NEXT: s_lshl_b32 s19, s98, 8 +; GFX9-NEXT: s_or_b32 s18, s18, s19 +; GFX9-NEXT: s_and_b32 s19, s96, 0xff +; GFX9-NEXT: s_lshl_b32 s20, s88, 8 +; GFX9-NEXT: s_or_b32 s19, s19, s20 +; GFX9-NEXT: s_and_b32 s18, s18, 0xffff +; GFX9-NEXT: s_lshl_b32 s19, s19, 16 +; GFX9-NEXT: s_or_b32 s18, s18, s19 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: v_mov_b32_e32 v1, s18 +; GFX9-NEXT: s_and_b32 s18, s41, 0xff +; GFX9-NEXT: s_lshl_b32 s19, s87, 8 +; GFX9-NEXT: s_or_b32 s18, s18, s19 +; GFX9-NEXT: v_readlane_b32 s19, v21, 0 +; GFX9-NEXT: s_and_b32 s19, s19, 0xff +; GFX9-NEXT: s_lshl_b32 s20, s86, 8 +; GFX9-NEXT: s_or_b32 s19, s19, s20 +; GFX9-NEXT: s_and_b32 s18, s18, 0xffff +; GFX9-NEXT: s_lshl_b32 s19, s19, 16 +; GFX9-NEXT: s_or_b32 s18, s18, s19 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64 +; GFX9-NEXT: v_mov_b32_e32 v1, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s84, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s18 +; GFX9-NEXT: s_and_b32 s18, s83, 0xff +; GFX9-NEXT: s_lshl_b32 s19, s92, 8 +; GFX9-NEXT: s_or_b32 s18, s18, s19 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s18, s18, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s18 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: s_and_b32 s16, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s81, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 1 +; GFX9-NEXT: s_and_b32 s17, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s64, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: s_and_b32 s14, s14, 0xff +; GFX9-NEXT: s_lshl_b32 s16, s70, 8 +; GFX9-NEXT: s_or_b32 s14, s14, s16 +; GFX9-NEXT: s_and_b32 s16, s52, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s78, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: s_and_b32 s14, s14, 0xffff +; GFX9-NEXT: s_lshl_b32 s16, s16, 16 +; GFX9-NEXT: s_or_b32 s14, s14, s16 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76 +; GFX9-NEXT: v_mov_b32_e32 v1, s14 +; GFX9-NEXT: s_and_b32 s14, s15, 0xff +; GFX9-NEXT: s_lshl_b32 s15, s55, 8 +; GFX9-NEXT: s_or_b32 s14, s14, s15 +; GFX9-NEXT: v_readlane_b32 s15, v21, 2 +; GFX9-NEXT: s_and_b32 s15, s15, 0xff +; GFX9-NEXT: s_lshl_b32 s16, s49, 8 +; GFX9-NEXT: s_or_b32 s15, s15, s16 +; GFX9-NEXT: s_and_b32 s14, s14, 0xffff +; GFX9-NEXT: s_lshl_b32 s15, s15, 16 +; GFX9-NEXT: s_or_b32 s14, s14, s15 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 +; GFX9-NEXT: v_mov_b32_e32 v1, s14 +; GFX9-NEXT: s_and_b32 s12, s12, 0xff +; GFX9-NEXT: s_lshl_b32 s14, s39, 8 +; GFX9-NEXT: s_or_b32 s12, s12, s14 +; GFX9-NEXT: s_and_b32 s14, s54, 0xff +; GFX9-NEXT: s_lshl_b32 s15, s74, 8 +; GFX9-NEXT: s_or_b32 s14, s14, s15 +; GFX9-NEXT: s_and_b32 s12, s12, 0xffff +; GFX9-NEXT: s_lshl_b32 s14, s14, 16 +; GFX9-NEXT: s_or_b32 s12, s12, s14 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: s_and_b32 s12, s13, 0xff +; GFX9-NEXT: v_readlane_b32 s13, v21, 39 +; GFX9-NEXT: s_lshl_b32 s13, s13, 8 +; GFX9-NEXT: s_or_b32 s12, s12, s13 +; GFX9-NEXT: v_readlane_b32 s13, v21, 3 +; GFX9-NEXT: s_and_b32 s13, s13, 0xff +; GFX9-NEXT: s_lshl_b32 s14, s63, 8 +; GFX9-NEXT: s_or_b32 s13, s13, s14 +; GFX9-NEXT: s_and_b32 s12, s12, 0xffff +; GFX9-NEXT: s_lshl_b32 s13, s13, 16 +; GFX9-NEXT: s_or_b32 s12, s12, s13 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: v_readlane_b32 s12, v21, 38 +; GFX9-NEXT: s_and_b32 s10, s10, 0xff +; GFX9-NEXT: s_lshl_b32 s12, s12, 8 +; GFX9-NEXT: s_or_b32 s10, s10, s12 +; GFX9-NEXT: v_readlane_b32 s12, v21, 37 +; GFX9-NEXT: s_and_b32 s12, s12, 0xff +; GFX9-NEXT: s_lshl_b32 s13, s62, 8 +; GFX9-NEXT: s_or_b32 s12, s12, s13 +; GFX9-NEXT: s_and_b32 s10, s10, 0xffff +; GFX9-NEXT: s_lshl_b32 s12, s12, 16 +; GFX9-NEXT: s_or_b32 s10, s10, s12 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NEXT: s_and_b32 s10, s11, 0xff +; GFX9-NEXT: v_readlane_b32 s11, v21, 36 +; GFX9-NEXT: s_lshl_b32 s11, s11, 8 +; GFX9-NEXT: s_or_b32 s10, s10, s11 +; GFX9-NEXT: v_readlane_b32 s11, v21, 4 +; GFX9-NEXT: s_and_b32 s11, s11, 0xff +; GFX9-NEXT: s_lshl_b32 s12, s61, 8 +; GFX9-NEXT: s_or_b32 s11, s11, s12 +; GFX9-NEXT: s_and_b32 s10, s10, 0xffff +; GFX9-NEXT: s_lshl_b32 s11, s11, 16 +; GFX9-NEXT: s_or_b32 s10, s10, s11 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NEXT: v_readlane_b32 s10, v21, 35 +; GFX9-NEXT: s_and_b32 s8, s8, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s10, 8 +; GFX9-NEXT: s_or_b32 s8, s8, s10 +; GFX9-NEXT: v_readlane_b32 s10, v21, 34 +; GFX9-NEXT: s_and_b32 s10, s10, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s60, 8 +; GFX9-NEXT: s_or_b32 s10, s10, s11 +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: s_lshl_b32 s10, s10, 16 +; GFX9-NEXT: s_or_b32 s8, s8, s10 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-NEXT: v_mov_b32_e32 v46, s51 -; GFX9-NEXT: v_mov_b32_e32 v56, s50 -; GFX9-NEXT: v_mov_b32_e32 v58, s49 -; GFX9-NEXT: v_mov_b32_e32 v60, s48 -; GFX9-NEXT: v_mov_b32_e32 v27, s39 -; GFX9-NEXT: v_mov_b32_e32 v29, s38 -; GFX9-NEXT: v_mov_b32_e32 v10, s34 -; GFX9-NEXT: v_mov_b32_e32 v11, s36 -; GFX9-NEXT: v_readlane_b32 s34, v62, 8 -; GFX9-NEXT: v_readlane_b32 s36, v62, 6 -; GFX9-NEXT: v_readlane_b32 s38, v62, 4 -; GFX9-NEXT: v_readlane_b32 s48, v62, 2 -; GFX9-NEXT: v_readlane_b32 s50, v62, 0 -; GFX9-NEXT: v_mov_b32_e32 v42, s46 -; GFX9-NEXT: v_mov_b32_e32 v41, s47 -; GFX9-NEXT: v_mov_b32_e32 v55, s15 -; GFX9-NEXT: v_mov_b32_e32 v40, s56 -; GFX9-NEXT: v_mov_b32_e32 v54, s57 -; GFX9-NEXT: v_mov_b32_e32 v52, s14 -; GFX9-NEXT: v_mov_b32_e32 v53, s58 -; GFX9-NEXT: v_mov_b32_e32 v51, s59 -; GFX9-NEXT: v_mov_b32_e32 v49, s13 -; GFX9-NEXT: v_mov_b32_e32 v50, s60 -; GFX9-NEXT: v_mov_b32_e32 v48, s61 -; GFX9-NEXT: v_mov_b32_e32 v38, s12 -; GFX9-NEXT: v_mov_b32_e32 v39, s62 -; GFX9-NEXT: v_mov_b32_e32 v37, s63 -; GFX9-NEXT: v_mov_b32_e32 v35, s11 -; GFX9-NEXT: v_mov_b32_e32 v36, s72 -; GFX9-NEXT: v_mov_b32_e32 v34, s73 -; GFX9-NEXT: v_mov_b32_e32 v32, s10 -; GFX9-NEXT: v_mov_b32_e32 v33, s74 -; GFX9-NEXT: v_mov_b32_e32 v31, s75 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v26, s53 -; GFX9-NEXT: v_mov_b32_e32 v25, s52 -; GFX9-NEXT: v_mov_b32_e32 v6, s70 -; GFX9-NEXT: v_mov_b32_e32 v12, s7 -; GFX9-NEXT: v_mov_b32_e32 v44, s6 -; GFX9-NEXT: v_mov_b32_e32 v23, s71 -; GFX9-NEXT: v_mov_b32_e32 v43, s67 -; GFX9-NEXT: v_mov_b32_e32 v24, s69 -; GFX9-NEXT: v_mov_b32_e32 v21, s68 -; GFX9-NEXT: v_mov_b32_e32 v45, s64 -; GFX9-NEXT: v_mov_b32_e32 v22, s66 -; GFX9-NEXT: v_mov_b32_e32 v19, s65 -; GFX9-NEXT: v_mov_b32_e32 v47, s99 -; GFX9-NEXT: v_mov_b32_e32 v20, s55 -; GFX9-NEXT: v_mov_b32_e32 v17, s54 -; GFX9-NEXT: v_mov_b32_e32 v57, s96 -; GFX9-NEXT: v_mov_b32_e32 v18, s98 -; GFX9-NEXT: v_mov_b32_e32 v15, s97 -; GFX9-NEXT: v_mov_b32_e32 v59, s85 -; GFX9-NEXT: v_mov_b32_e32 v16, s87 -; GFX9-NEXT: v_mov_b32_e32 v13, s86 -; GFX9-NEXT: v_mov_b32_e32 v61, s82 -; GFX9-NEXT: v_mov_b32_e32 v14, s84 -; GFX9-NEXT: v_mov_b32_e32 v7, s83 -; GFX9-NEXT: v_mov_b32_e32 v28, s80 -; GFX9-NEXT: v_mov_b32_e32 v8, s81 -; GFX9-NEXT: v_mov_b32_e32 v1, s78 -; GFX9-NEXT: v_mov_b32_e32 v2, s88 -; GFX9-NEXT: v_mov_b32_e32 v3, s90 -; GFX9-NEXT: v_mov_b32_e32 v4, s92 -; GFX9-NEXT: v_mov_b32_e32 v5, s94 -; GFX9-NEXT: v_mov_b32_e32 v9, s30 -; GFX9-NEXT: v_readlane_b32 s11, v62, 10 -; GFX9-NEXT: v_readlane_b32 s12, v62, 11 -; GFX9-NEXT: v_readlane_b32 s13, v62, 12 -; GFX9-NEXT: v_readlane_b32 s14, v62, 13 -; GFX9-NEXT: v_readlane_b32 s15, v62, 14 -; GFX9-NEXT: v_readlane_b32 s76, v62, 15 -; GFX9-NEXT: v_readlane_b32 s77, v62, 16 -; GFX9-NEXT: v_readlane_b32 s78, v62, 17 -; GFX9-NEXT: v_readlane_b32 s9, v62, 18 -; GFX9-NEXT: v_readlane_b32 s10, v62, 19 -; GFX9-NEXT: v_readlane_b32 s41, v62, 20 -; GFX9-NEXT: v_readlane_b32 s43, v62, 21 -; GFX9-NEXT: v_readlane_b32 s45, v62, 22 -; GFX9-NEXT: v_readlane_b32 s75, v62, 23 -; GFX9-NEXT: v_readlane_b32 s79, v62, 24 -; GFX9-NEXT: v_readlane_b32 s74, v62, 25 -; GFX9-NEXT: v_readlane_b32 s88, v62, 26 -; GFX9-NEXT: v_readlane_b32 s73, v62, 27 -; GFX9-NEXT: v_readlane_b32 s89, v62, 28 -; GFX9-NEXT: v_readlane_b32 s72, v62, 29 -; GFX9-NEXT: v_readlane_b32 s90, v62, 30 -; GFX9-NEXT: v_readlane_b32 s63, v62, 31 -; GFX9-NEXT: v_readlane_b32 s91, v62, 32 -; GFX9-NEXT: v_readlane_b32 s62, v62, 33 -; GFX9-NEXT: v_readlane_b32 s92, v62, 34 -; GFX9-NEXT: v_readlane_b32 s61, v62, 35 -; GFX9-NEXT: v_readlane_b32 s93, v62, 36 -; GFX9-NEXT: v_readlane_b32 s60, v62, 37 -; GFX9-NEXT: v_readlane_b32 s94, v62, 38 -; GFX9-NEXT: v_readlane_b32 s59, v62, 39 -; GFX9-NEXT: v_readlane_b32 s95, v62, 40 -; GFX9-NEXT: v_readlane_b32 s58, v62, 41 -; GFX9-NEXT: v_readlane_b32 vcc_lo, v62, 42 -; GFX9-NEXT: v_readlane_b32 s57, v62, 43 -; GFX9-NEXT: v_readlane_b32 vcc_hi, v62, 44 -; GFX9-NEXT: v_readlane_b32 s56, v62, 45 -; GFX9-NEXT: v_readlane_b32 s30, v62, 46 -; GFX9-NEXT: v_readlane_b32 s47, v62, 47 -; GFX9-NEXT: v_readlane_b32 s8, v62, 48 -; GFX9-NEXT: v_readlane_b32 s7, v62, 49 -; GFX9-NEXT: v_readlane_b32 s35, v62, 9 -; GFX9-NEXT: v_readlane_b32 s37, v62, 7 -; GFX9-NEXT: v_readlane_b32 s39, v62, 5 -; GFX9-NEXT: v_readlane_b32 s49, v62, 3 -; GFX9-NEXT: v_readlane_b32 s51, v62, 1 -; GFX9-NEXT: .LBB91_5: ; %end -; GFX9-NEXT: s_and_b32 s6, s16, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s7, 8 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s7, s8, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s44, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: s_and_b32 s6, s6, 0xffff -; GFX9-NEXT: s_lshl_b32 s7, s7, 16 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: v_mov_b32_e32 v30, s6 -; GFX9-NEXT: s_and_b32 s6, s17, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s47, 8 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s7, s11, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s30, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: s_and_b32 s6, s6, 0xffff -; GFX9-NEXT: s_lshl_b32 s7, s7, 16 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen -; GFX9-NEXT: v_mov_b32_e32 v30, s6 -; GFX9-NEXT: s_and_b32 s6, s18, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s56, 8 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s7, vcc_hi, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s42, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: s_and_b32 s6, s6, 0xffff -; GFX9-NEXT: s_lshl_b32 s7, s7, 16 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: v_mov_b32_e32 v30, s6 -; GFX9-NEXT: s_and_b32 s6, s19, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s57, 8 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s7, s12, 0xff -; GFX9-NEXT: s_lshl_b32 s8, vcc_lo, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: s_and_b32 s6, s6, 0xffff -; GFX9-NEXT: s_lshl_b32 s7, s7, 16 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: v_mov_b32_e32 v30, s6 -; GFX9-NEXT: s_and_b32 s6, s20, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s58, 8 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s7, s95, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s40, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: s_and_b32 s6, s6, 0xffff -; GFX9-NEXT: s_lshl_b32 s7, s7, 16 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: v_mov_b32_e32 v30, s6 -; GFX9-NEXT: s_and_b32 s6, s21, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s59, 8 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s7, s13, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s94, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: s_and_b32 s6, s6, 0xffff -; GFX9-NEXT: s_lshl_b32 s7, s7, 16 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: v_mov_b32_e32 v30, s6 -; GFX9-NEXT: s_and_b32 s6, s22, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s60, 8 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s7, s93, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s50, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: s_and_b32 s6, s6, 0xffff -; GFX9-NEXT: s_lshl_b32 s7, s7, 16 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: v_mov_b32_e32 v30, s6 -; GFX9-NEXT: s_and_b32 s6, s23, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s61, 8 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s7, s14, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s92, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: s_and_b32 s6, s6, 0xffff -; GFX9-NEXT: s_lshl_b32 s7, s7, 16 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: v_mov_b32_e32 v30, s6 -; GFX9-NEXT: s_and_b32 s6, s24, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s62, 8 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s7, s91, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s48, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: s_and_b32 s6, s6, 0xffff -; GFX9-NEXT: s_lshl_b32 s7, s7, 16 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: v_mov_b32_e32 v30, s6 -; GFX9-NEXT: s_and_b32 s6, s25, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s63, 8 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s7, s15, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s90, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: s_and_b32 s6, s6, 0xffff -; GFX9-NEXT: s_lshl_b32 s7, s7, 16 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: v_mov_b32_e32 v30, s6 -; GFX9-NEXT: s_and_b32 s6, s26, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s72, 8 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s7, s89, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s38, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: s_and_b32 s6, s6, 0xffff -; GFX9-NEXT: s_lshl_b32 s7, s7, 16 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: v_mov_b32_e32 v30, s6 -; GFX9-NEXT: s_and_b32 s6, s27, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s73, 8 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s7, s76, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s88, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: s_and_b32 s6, s6, 0xffff -; GFX9-NEXT: s_lshl_b32 s7, s7, 16 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: v_mov_b32_e32 v30, s6 -; GFX9-NEXT: s_and_b32 s6, s28, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s74, 8 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s7, s79, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s36, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s9, 0xff +; GFX9-NEXT: v_readlane_b32 s9, v21, 33 +; GFX9-NEXT: s_lshl_b32 s9, s9, 8 +; GFX9-NEXT: s_or_b32 s8, s8, s9 +; GFX9-NEXT: v_readlane_b32 s9, v21, 5 +; GFX9-NEXT: v_readlane_b32 s10, v21, 32 +; GFX9-NEXT: s_and_b32 s9, s9, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s10, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s10 +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s8, s8, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s37, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s8 +; GFX9-NEXT: s_and_b32 s8, s35, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s56, 8 +; GFX9-NEXT: s_or_b32 s8, s8, s9 ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff -; GFX9-NEXT: s_lshl_b32 s7, s7, 16 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: v_mov_b32_e32 v30, s6 -; GFX9-NEXT: s_and_b32 s6, s29, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s75, 8 +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s8 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: s_and_b32 s6, s7, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s31, 8 ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s7, s77, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s45, 8 +; GFX9-NEXT: v_readlane_b32 s7, v21, 6 +; GFX9-NEXT: s_and_b32 s7, s7, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s95, 8 ; GFX9-NEXT: s_or_b32 s7, s7, s8 ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: v_mov_b32_e32 v30, s6 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: s_and_b32 s4, s4, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s43, 8 +; GFX9-NEXT: s_lshl_b32 s6, s91, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s6 -; GFX9-NEXT: s_and_b32 s6, s41, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s34, 8 +; GFX9-NEXT: s_and_b32 s6, s77, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s46, 8 ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 ; GFX9-NEXT: s_or_b32 s4, s4, s6 -; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:52 -; GFX9-NEXT: v_mov_b32_e32 v30, s4 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_and_b32 s4, s5, 0xff -; GFX9-NEXT: s_lshl_b32 s5, s10, 8 +; GFX9-NEXT: s_lshl_b32 s5, s73, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: s_and_b32 s5, s78, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s9, 8 +; GFX9-NEXT: v_readlane_b32 s5, v21, 7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s59, 8 ; GFX9-NEXT: s_or_b32 s5, s5, s6 ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s5, s5, 16 ; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: v_mov_b32_e32 v30, s4 -; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX9-NEXT: v_or_b32_sdwa v11, v29, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX9-NEXT: v_or_b32_sdwa v5, v58, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GFX9-NEXT: v_or_b32_sdwa v4, v56, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_readlane_b32 s99, v63, 35 -; GFX9-NEXT: v_readlane_b32 s98, v63, 34 -; GFX9-NEXT: v_readlane_b32 s97, v63, 33 -; GFX9-NEXT: v_readlane_b32 s96, v63, 32 -; GFX9-NEXT: v_readlane_b32 s87, v63, 31 -; GFX9-NEXT: v_readlane_b32 s86, v63, 30 -; GFX9-NEXT: v_readlane_b32 s85, v63, 29 -; GFX9-NEXT: v_readlane_b32 s84, v63, 28 -; GFX9-NEXT: v_readlane_b32 s83, v63, 27 -; GFX9-NEXT: v_readlane_b32 s82, v63, 26 -; GFX9-NEXT: v_readlane_b32 s81, v63, 25 -; GFX9-NEXT: v_readlane_b32 s80, v63, 24 -; GFX9-NEXT: v_readlane_b32 s71, v63, 23 -; GFX9-NEXT: v_readlane_b32 s70, v63, 22 -; GFX9-NEXT: v_readlane_b32 s69, v63, 21 -; GFX9-NEXT: v_readlane_b32 s68, v63, 20 -; GFX9-NEXT: v_readlane_b32 s67, v63, 19 -; GFX9-NEXT: v_readlane_b32 s66, v63, 18 -; GFX9-NEXT: v_readlane_b32 s65, v63, 17 -; GFX9-NEXT: v_readlane_b32 s64, v63, 16 -; GFX9-NEXT: v_readlane_b32 s55, v63, 15 -; GFX9-NEXT: v_readlane_b32 s54, v63, 14 -; GFX9-NEXT: v_readlane_b32 s53, v63, 13 -; GFX9-NEXT: v_readlane_b32 s52, v63, 12 -; GFX9-NEXT: v_readlane_b32 s51, v63, 11 -; GFX9-NEXT: v_readlane_b32 s50, v63, 10 -; GFX9-NEXT: v_readlane_b32 s49, v63, 9 -; GFX9-NEXT: v_readlane_b32 s48, v63, 8 -; GFX9-NEXT: v_readlane_b32 s39, v63, 7 -; GFX9-NEXT: v_readlane_b32 s38, v63, 6 -; GFX9-NEXT: v_readlane_b32 s37, v63, 5 -; GFX9-NEXT: v_readlane_b32 s36, v63, 4 -; GFX9-NEXT: v_readlane_b32 s35, v63, 3 -; GFX9-NEXT: v_readlane_b32 s34, v63, 2 -; GFX9-NEXT: v_readlane_b32 s31, v63, 1 -; GFX9-NEXT: v_readlane_b32 s30, v63, 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v7, v30, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:64 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v8 -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v28 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v8, v11, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:68 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v10 -; GFX9-NEXT: v_or_b32_sdwa v7, v33, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v27, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:72 -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v14 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v61 -; GFX9-NEXT: v_or_b32_sdwa v7, v31, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v8, v10, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:76 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v9 -; GFX9-NEXT: v_or_b32_sdwa v7, v36, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v60, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:80 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v16 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v59 -; GFX9-NEXT: v_or_b32_sdwa v7, v34, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v32, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:84 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v17 -; GFX9-NEXT: v_or_b32_sdwa v7, v39, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:88 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v18 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v57 -; GFX9-NEXT: v_or_b32_sdwa v5, v37, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v35, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:92 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v19 -; GFX9-NEXT: v_or_b32_sdwa v5, v50, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:96 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v20 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v47 -; GFX9-NEXT: v_or_b32_sdwa v4, v48, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v38, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:100 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v21 -; GFX9-NEXT: v_or_b32_sdwa v4, v53, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:104 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v22 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v45 -; GFX9-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v49, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:108 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v23 -; GFX9-NEXT: v_or_b32_sdwa v3, v40, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:112 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v24 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v43 -; GFX9-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:116 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v25 -; GFX9-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v12 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v6 -; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s99, v20, 35 +; GFX9-NEXT: v_readlane_b32 s98, v20, 34 +; GFX9-NEXT: v_readlane_b32 s97, v20, 33 +; GFX9-NEXT: v_readlane_b32 s96, v20, 32 +; GFX9-NEXT: v_readlane_b32 s87, v20, 31 +; GFX9-NEXT: v_readlane_b32 s86, v20, 30 +; GFX9-NEXT: v_readlane_b32 s85, v20, 29 +; GFX9-NEXT: v_readlane_b32 s84, v20, 28 +; GFX9-NEXT: v_readlane_b32 s83, v20, 27 +; GFX9-NEXT: v_readlane_b32 s82, v20, 26 +; GFX9-NEXT: v_readlane_b32 s81, v20, 25 +; GFX9-NEXT: v_readlane_b32 s80, v20, 24 +; GFX9-NEXT: v_readlane_b32 s71, v20, 23 +; GFX9-NEXT: v_readlane_b32 s70, v20, 22 +; GFX9-NEXT: v_readlane_b32 s69, v20, 21 +; GFX9-NEXT: v_readlane_b32 s68, v20, 20 +; GFX9-NEXT: v_readlane_b32 s67, v20, 19 +; GFX9-NEXT: v_readlane_b32 s66, v20, 18 +; GFX9-NEXT: v_readlane_b32 s65, v20, 17 +; GFX9-NEXT: v_readlane_b32 s64, v20, 16 +; GFX9-NEXT: v_readlane_b32 s55, v20, 15 +; GFX9-NEXT: v_readlane_b32 s54, v20, 14 +; GFX9-NEXT: v_readlane_b32 s53, v20, 13 +; GFX9-NEXT: v_readlane_b32 s52, v20, 12 +; GFX9-NEXT: v_readlane_b32 s51, v20, 11 +; GFX9-NEXT: v_readlane_b32 s50, v20, 10 +; GFX9-NEXT: v_readlane_b32 s49, v20, 9 +; GFX9-NEXT: v_readlane_b32 s48, v20, 8 +; GFX9-NEXT: v_readlane_b32 s39, v20, 7 +; GFX9-NEXT: v_readlane_b32 s38, v20, 6 +; GFX9-NEXT: v_readlane_b32 s37, v20, 5 +; GFX9-NEXT: v_readlane_b32 s36, v20, 4 +; GFX9-NEXT: v_readlane_b32 s35, v20, 3 +; GFX9-NEXT: v_readlane_b32 s34, v20, 2 +; GFX9-NEXT: v_readlane_b32 s31, v20, 1 +; GFX9-NEXT: v_readlane_b32 s30, v20, 0 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB91_4: +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr67 +; GFX9-NEXT: ; implicit-def: $sgpr99 +; GFX9-NEXT: ; implicit-def: $sgpr97 +; GFX9-NEXT: ; implicit-def: $sgpr66 +; GFX9-NEXT: ; implicit-def: $sgpr85 +; GFX9-NEXT: ; implicit-def: $sgpr65 +; GFX9-NEXT: ; implicit-def: $sgpr82 +; GFX9-NEXT: ; implicit-def: $sgpr80 +; GFX9-NEXT: ; implicit-def: $sgpr71 +; GFX9-NEXT: ; implicit-def: $sgpr53 +; GFX9-NEXT: ; implicit-def: $sgpr51 +; GFX9-NEXT: ; implicit-def: $sgpr50 +; GFX9-NEXT: ; implicit-def: $sgpr48 +; GFX9-NEXT: ; implicit-def: $sgpr38 +; GFX9-NEXT: ; implicit-def: $sgpr68 +; GFX9-NEXT: ; implicit-def: $sgpr69 +; GFX9-NEXT: ; implicit-def: $sgpr98 +; GFX9-NEXT: ; implicit-def: $sgpr96 +; GFX9-NEXT: ; implicit-def: $sgpr87 +; GFX9-NEXT: ; implicit-def: $sgpr86 +; GFX9-NEXT: ; implicit-def: $sgpr84 +; GFX9-NEXT: ; implicit-def: $sgpr83 +; GFX9-NEXT: ; implicit-def: $sgpr81 +; GFX9-NEXT: ; implicit-def: $sgpr64 +; GFX9-NEXT: ; implicit-def: $sgpr70 +; GFX9-NEXT: ; implicit-def: $sgpr52 +; GFX9-NEXT: ; implicit-def: $sgpr55 +; GFX9-NEXT: ; implicit-def: $sgpr49 +; GFX9-NEXT: ; implicit-def: $sgpr39 +; GFX9-NEXT: ; implicit-def: $sgpr54 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr37 +; GFX9-NEXT: ; implicit-def: $sgpr35 +; GFX9-NEXT: ; implicit-def: $sgpr31 +; GFX9-NEXT: ; implicit-def: $sgpr95 +; GFX9-NEXT: ; implicit-def: $sgpr91 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr36 +; GFX9-NEXT: ; implicit-def: $sgpr34 +; GFX9-NEXT: ; implicit-def: $sgpr30 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: s_branch .LBB91_2 ; -; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v128i8_scalar: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11-TRUE16-NEXT: s_clause 0x3 ; 16-byte Folded Spill -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:12 -; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 -; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s96, 0 -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s72, v1 -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s73, v2 -; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s97, 1 -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s62, v3 -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s63, v4 -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s60, v5 -; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s34, 2 -; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s98, 2 -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s61, v6 -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s58, v7 -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s59, v8 -; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s35, 3 -; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s99, 3 -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s46, v9 -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s47, v10 -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s44, v11 -; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s36, 4 -; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s100, 4 -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s45, v12 -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s42, v13 -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s43, v14 -; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s37, 5 -; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s101, 5 -; GFX11-TRUE16-NEXT: s_mov_b32 vcc_hi, 0 -; GFX11-TRUE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane -; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s38, 6 -; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s102, 6 -; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s39, 7 -; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s103, 7 -; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s48, 8 -; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s104, 8 -; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s49, 9 -; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s50, 10 -; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s51, 11 -; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s52, 12 -; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s53, 13 -; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s54, 14 -; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s55, 15 -; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s64, 16 -; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s65, 17 -; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s66, 18 -; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s67, 19 -; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s68, 20 -; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s69, 21 -; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s70, 22 -; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s71, 23 -; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s80, 24 -; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s81, 25 -; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s82, 26 -; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s83, 27 -; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s84, 28 -; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s85, 29 -; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s86, 30 -; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s87, 31 -; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB91_3 -; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s27, 24 -; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[26:27], 24 -; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 15 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s99, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s100, s2, 8 -; GFX11-TRUE16-NEXT: s_lshr_b32 s101, s1, 24 -; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 14 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s27, 8 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s102, s1, 8 -; GFX11-TRUE16-NEXT: s_lshr_b32 s103, s0, 16 -; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s104, s0, 8 -; GFX11-TRUE16-NEXT: s_lshr_b32 s85, s43, 24 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s43, 16 -; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 17 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s26, 8 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s43, 8 -; GFX11-TRUE16-NEXT: s_lshr_b32 s87, s42, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s86, s42, 8 -; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 18 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s25, 24 -; GFX11-TRUE16-NEXT: s_lshr_b32 s81, s45, 24 -; GFX11-TRUE16-NEXT: s_lshr_b32 s98, s45, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s84, s45, 8 -; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 19 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s44, 8 -; GFX11-TRUE16-NEXT: s_lshr_b32 s70, s47, 24 -; GFX11-TRUE16-NEXT: s_lshr_b32 s97, s47, 16 -; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 13 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s25, 8 -; GFX11-TRUE16-NEXT: s_lshr_b32 s80, s47, 8 -; GFX11-TRUE16-NEXT: s_lshr_b32 s83, s46, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s82, s46, 8 -; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 20 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s66, s59, 24 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s59, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s69, s59, 8 -; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 21 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s24, 8 -; GFX11-TRUE16-NEXT: s_lshr_b32 s71, s58, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s39, s58, 8 -; GFX11-TRUE16-NEXT: s_lshr_b32 s55, s61, 24 -; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 22 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s23, 24 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s61, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s65, s61, 8 -; GFX11-TRUE16-NEXT: s_lshr_b32 s68, s60, 16 -; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 23 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s67, s60, 8 -; GFX11-TRUE16-NEXT: s_lshr_b32 s51, s63, 24 -; GFX11-TRUE16-NEXT: s_lshr_b32 s96, s63, 16 -; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 12 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s23, 8 -; GFX11-TRUE16-NEXT: s_lshr_b32 s54, s63, 8 -; GFX11-TRUE16-NEXT: s_lshr_b32 s38, s62, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s64, s62, 8 -; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 24 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s36, s73, 24 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s73, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s50, s73, 8 -; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 25 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s22, 8 -; GFX11-TRUE16-NEXT: s_lshr_b32 s53, s72, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s52, s72, 8 -; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s29, 24 -; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 26 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s21, 24 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s35, s29, 8 -; GFX11-TRUE16-NEXT: s_lshr_b32 s37, s28, 16 -; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 27 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s28, 8 -; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[16:17], 24 -; GFX11-TRUE16-NEXT: s_lshr_b64 s[40:41], s[2:3], 24 -; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 11 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s21, 8 -; GFX11-TRUE16-NEXT: s_lshr_b64 s[56:57], s[0:1], 24 -; GFX11-TRUE16-NEXT: s_lshr_b64 s[74:75], s[42:43], 24 -; GFX11-TRUE16-NEXT: s_lshr_b64 s[76:77], s[44:45], 24 -; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 28 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b64 s[78:79], s[46:47], 24 -; GFX11-TRUE16-NEXT: s_lshr_b64 s[88:89], s[58:59], 24 -; GFX11-TRUE16-NEXT: s_lshr_b64 s[90:91], s[60:61], 24 -; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 29 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s20, 8 -; GFX11-TRUE16-NEXT: s_lshr_b64 s[92:93], s[62:63], 24 -; GFX11-TRUE16-NEXT: s_lshr_b64 s[94:95], s[72:73], 24 -; GFX11-TRUE16-NEXT: s_lshr_b64 s[30:31], s[28:29], 24 -; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 30 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s19, 24 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 31 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s19, 16 -; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 10 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s19, 8 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 1 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 8 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 2 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s17, 24 -; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 3 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s17, 16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 9 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s17, 8 -; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 4 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 5 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 8 -; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 6 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s3, 24 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 7 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s3, 16 -; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 8 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s3, 8 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 8 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s44, 16 -; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s12, 6 -; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s13, 7 -; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 -; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s12, 4 -; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s13, 5 -; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[22:23], 24 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s12, 2 -; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s13, 3 -; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[20:21], 24 -; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s12, 0 -; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s13, 1 -; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[18:19], 24 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_hi -; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB91_4 -; GFX11-TRUE16-NEXT: .LBB91_2: ; %cmp.true -; GFX11-TRUE16-NEXT: s_and_b32 s5, s29, 0xffff0000 -; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s29, 16 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s5 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xffff0000 -; GFX11-TRUE16-NEXT: s_lshl_b32 s78, s28, 16 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v2 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s1, 0xffff0000 -; GFX11-TRUE16-NEXT: s_and_b32 s15, s45, 0xffff0000 -; GFX11-TRUE16-NEXT: s_lshl_b32 s28, s45, 16 -; GFX11-TRUE16-NEXT: s_and_b32 s8, s43, 0xffff0000 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1 -; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s43, 16 -; GFX11-TRUE16-NEXT: s_and_b32 s6, s73, 0xffff0000 -; GFX11-TRUE16-NEXT: s_lshl_b32 s77, s73, 16 -; GFX11-TRUE16-NEXT: s_and_b32 s76, s72, 0xffff0000 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 -; GFX11-TRUE16-NEXT: s_lshl_b32 s75, s72, 16 -; GFX11-TRUE16-NEXT: s_and_b32 s11, s63, 0xffff0000 -; GFX11-TRUE16-NEXT: s_lshl_b32 s74, s63, 16 -; GFX11-TRUE16-NEXT: s_and_b32 s73, s62, 0xffff0000 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: s_lshl_b32 s72, s62, 16 -; GFX11-TRUE16-NEXT: s_and_b32 s62, s61, 0xffff0000 -; GFX11-TRUE16-NEXT: s_lshl_b32 s63, s61, 16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v1 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v7, vcc_lo -; GFX11-TRUE16-NEXT: s_and_b32 s61, s60, 0xffff0000 -; GFX11-TRUE16-NEXT: s_lshl_b32 s57, s60, 16 -; GFX11-TRUE16-NEXT: s_and_b32 s40, s59, 0xffff0000 -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s45, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_lshl_b32 s56, s59, 16 -; GFX11-TRUE16-NEXT: s_and_b32 s29, s58, 0xffff0000 -; GFX11-TRUE16-NEXT: s_lshl_b32 s14, s58, 16 -; GFX11-TRUE16-NEXT: s_bfe_u32 s4, s45, 0x10010 -; GFX11-TRUE16-NEXT: s_and_b32 s12, s47, 0xffff0000 -; GFX11-TRUE16-NEXT: s_add_i32 s43, s4, s45 -; GFX11-TRUE16-NEXT: s_lshl_b32 s13, s47, 16 -; GFX11-TRUE16-NEXT: s_and_b32 s47, s46, 0xffff0000 -; GFX11-TRUE16-NEXT: s_lshl_b32 s41, s46, 16 -; GFX11-TRUE16-NEXT: s_and_b32 s10, s44, 0xffff0000 -; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s44, 16 -; GFX11-TRUE16-NEXT: s_and_b32 s5, s42, 0xffff0000 -; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s42, 16 -; GFX11-TRUE16-NEXT: s_addk_i32 s43, 0x7fff -; GFX11-TRUE16-NEXT: s_bitset1_b32 s45, 22 -; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo -; GFX11-TRUE16-NEXT: s_cselect_b32 s42, s45, s43 -; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s1 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s78 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v2 -; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s42, 16 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v3, v6 -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s1, v4 -; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 -; GFX11-TRUE16-NEXT: s_bfe_u32 s43, s1, 0x10010 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.l -; GFX11-TRUE16-NEXT: s_add_i32 s43, s43, s1 -; GFX11-TRUE16-NEXT: s_bitset1_b32 s1, 22 -; GFX11-TRUE16-NEXT: s_addk_i32 s43, 0x7fff -; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1 -; GFX11-TRUE16-NEXT: s_cselect_b32 s1, s1, s43 -; GFX11-TRUE16-NEXT: s_and_b32 s42, s0, 0xffff0000 -; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 16 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s42 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s77 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v25.l -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s42, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s6 -; GFX11-TRUE16-NEXT: s_bfe_u32 s6, s42, 0x10010 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: s_add_i32 s6, s6, s42 -; GFX11-TRUE16-NEXT: s_bitset1_b32 s42, 22 -; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x7fff -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; GFX11-TRUE16-NEXT: s_and_b32 s43, vcc_lo, exec_lo -; GFX11-TRUE16-NEXT: s_cselect_b32 s6, s42, s6 -; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s6, 16 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v8, v3 -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_bfe_u32 s42, s0, 0x10010 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-TRUE16-NEXT: s_add_i32 s42, s42, s0 -; GFX11-TRUE16-NEXT: s_bitset1_b32 s0, 22 -; GFX11-TRUE16-NEXT: s_addk_i32 s42, 0x7fff -; GFX11-TRUE16-NEXT: s_and_b32 s43, vcc_lo, exec_lo -; GFX11-TRUE16-NEXT: s_cselect_b32 s0, s0, s42 -; GFX11-TRUE16-NEXT: s_and_b32 s42, s3, 0xffff0000 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s42 -; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 16 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v27.l -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s42, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s76 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 16, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; GFX11-TRUE16-NEXT: s_bfe_u32 s43, s42, 0x10010 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: s_add_i32 s43, s43, s42 -; GFX11-TRUE16-NEXT: s_bitset1_b32 s42, 22 -; GFX11-TRUE16-NEXT: s_addk_i32 s43, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v7, 16, 1 -; GFX11-TRUE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo -; GFX11-TRUE16-NEXT: s_cselect_b32 s42, s42, s43 -; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s75 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v3, v7 -; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s42, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s3, v6 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: s_bfe_u32 s43, s3, 0x10010 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: s_add_i32 s43, s43, s3 -; GFX11-TRUE16-NEXT: s_bitset1_b32 s3, 22 -; GFX11-TRUE16-NEXT: s_addk_i32 s43, 0x7fff -; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo -; GFX11-TRUE16-NEXT: s_cselect_b32 s3, s3, s43 -; GFX11-TRUE16-NEXT: s_and_b32 s42, s2, 0xffff0000 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s42 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 -; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s3, 16 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s74 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s42, v6 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v26.l -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v7, 16, 1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v28.l -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s11 -; GFX11-TRUE16-NEXT: s_bfe_u32 s11, s42, 0x10010 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: s_add_i32 s11, s11, s42 -; GFX11-TRUE16-NEXT: s_bitset1_b32 s42, 22 -; GFX11-TRUE16-NEXT: s_addk_i32 s11, 0x7fff -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v2 -; GFX11-TRUE16-NEXT: s_and_b32 s43, vcc_lo, exec_lo -; GFX11-TRUE16-NEXT: s_cselect_b32 s11, s42, s11 -; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v7 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s11, 16 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v6, v3 -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s2, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_bfe_u32 s42, s2, 0x10010 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-TRUE16-NEXT: s_add_i32 s42, s42, s2 -; GFX11-TRUE16-NEXT: s_bitset1_b32 s2, 22 -; GFX11-TRUE16-NEXT: s_addk_i32 s42, 0x7fff -; GFX11-TRUE16-NEXT: s_and_b32 s43, vcc_lo, exec_lo -; GFX11-TRUE16-NEXT: s_cselect_b32 s2, s2, s42 -; GFX11-TRUE16-NEXT: s_and_b32 s42, s17, 0xffff0000 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s42 -; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 16 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s73 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v30.l -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s42, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v7, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; GFX11-TRUE16-NEXT: s_bfe_u32 s43, s42, 0x10010 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: s_add_i32 s43, s43, s42 -; GFX11-TRUE16-NEXT: s_bitset1_b32 s42, 22 -; GFX11-TRUE16-NEXT: s_addk_i32 s43, 0x7fff -; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s72 -; GFX11-TRUE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo -; GFX11-TRUE16-NEXT: s_cselect_b32 s42, s42, s43 -; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v1 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s17 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v3, v7 -; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s42, 16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s17, v6 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v29.l -; GFX11-TRUE16-NEXT: s_bfe_u32 s43, s17, 0x10010 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v31.l -; GFX11-TRUE16-NEXT: s_add_i32 s43, s43, s17 -; GFX11-TRUE16-NEXT: s_bitset1_b32 s17, 22 -; GFX11-TRUE16-NEXT: s_addk_i32 s43, 0x7fff -; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo -; GFX11-TRUE16-NEXT: s_cselect_b32 s17, s17, s43 -; GFX11-TRUE16-NEXT: s_and_b32 s42, s16, 0xffff0000 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s42 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s63 -; GFX11-TRUE16-NEXT: s_lshr_b32 s17, s17, 16 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s42, v6 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v7, 16, 1 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[13:14], 24, v[11:12] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[14:15], 24, v[4:5] -; GFX11-TRUE16-NEXT: s_bfe_u32 s43, s42, 0x10010 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: s_add_i32 s43, s43, s42 -; GFX11-TRUE16-NEXT: s_bitset1_b32 s42, 22 -; GFX11-TRUE16-NEXT: s_addk_i32 s43, 0x7fff -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX11-TRUE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo -; GFX11-TRUE16-NEXT: s_cselect_b32 s42, s42, s43 -; GFX11-TRUE16-NEXT: s_lshl_b32 s16, s16, 16 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s62 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s16 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s42, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s16, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v6, v3 -; GFX11-TRUE16-NEXT: s_bfe_u32 s43, s16, 0x10010 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8 -; GFX11-TRUE16-NEXT: s_add_i32 s43, s43, s16 -; GFX11-TRUE16-NEXT: s_bitset1_b32 s16, 22 -; GFX11-TRUE16-NEXT: s_addk_i32 s43, 0x7fff -; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo -; GFX11-TRUE16-NEXT: s_cselect_b32 s16, s16, s43 -; GFX11-TRUE16-NEXT: s_and_b32 s42, s19, 0xffff0000 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v7 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s42 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_lshr_b32 s16, s16, 16 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s61 -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s42, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v33.l -; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v7, 16, 1 -; GFX11-TRUE16-NEXT: s_bfe_u32 s43, s42, 0x10010 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v6 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: s_add_i32 s43, s43, s42 -; GFX11-TRUE16-NEXT: s_bitset1_b32 s42, 22 -; GFX11-TRUE16-NEXT: s_addk_i32 s43, 0x7fff -; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s57 -; GFX11-TRUE16-NEXT: s_and_b32 s45, vcc_lo, exec_lo -; GFX11-TRUE16-NEXT: s_cselect_b32 s42, s42, s43 -; GFX11-TRUE16-NEXT: s_lshl_b32 s19, s19, 16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s19 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v3, v7 -; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s42, 16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v7 -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s19, v6 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: s_bfe_u32 s43, s19, 0x10010 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v32.l -; GFX11-TRUE16-NEXT: s_add_i32 s43, s43, s19 -; GFX11-TRUE16-NEXT: s_bitset1_b32 s19, 22 -; GFX11-TRUE16-NEXT: s_addk_i32 s43, 0x7fff -; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo -; GFX11-TRUE16-NEXT: s_cselect_b32 s19, s19, s43 -; GFX11-TRUE16-NEXT: s_and_b32 s42, s18, 0xffff0000 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s42 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s56 -; GFX11-TRUE16-NEXT: s_lshr_b32 s19, s19, 16 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s42, v6 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v7, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v34.l -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s45, s17, s60 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s44, s16, s44 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s40 -; GFX11-TRUE16-NEXT: s_bfe_u32 s40, s42, 0x10010 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: s_add_i32 s40, s40, s42 -; GFX11-TRUE16-NEXT: s_bitset1_b32 s42, 22 -; GFX11-TRUE16-NEXT: s_addk_i32 s40, 0x7fff -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v2 -; GFX11-TRUE16-NEXT: s_and_b32 s43, vcc_lo, exec_lo -; GFX11-TRUE16-NEXT: s_cselect_b32 s40, s42, s40 -; GFX11-TRUE16-NEXT: s_lshl_b32 s18, s18, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s18 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s40, 16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s18, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v6, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8 -; GFX11-TRUE16-NEXT: s_bfe_u32 s42, s18, 0x10010 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v7 -; GFX11-TRUE16-NEXT: s_add_i32 s42, s42, s18 -; GFX11-TRUE16-NEXT: s_bitset1_b32 s18, 22 -; GFX11-TRUE16-NEXT: s_addk_i32 s42, 0x7fff -; GFX11-TRUE16-NEXT: s_and_b32 s43, vcc_lo, exec_lo -; GFX11-TRUE16-NEXT: s_cselect_b32 s18, s18, s42 -; GFX11-TRUE16-NEXT: s_and_b32 s42, s21, 0xffff0000 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s42 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s29 -; GFX11-TRUE16-NEXT: s_lshr_b32 s18, s18, 16 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s29, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v7, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 -; GFX11-TRUE16-NEXT: s_bfe_u32 s42, s29, 0x10010 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: s_add_i32 s42, s42, s29 -; GFX11-TRUE16-NEXT: s_bitset1_b32 s29, 22 -; GFX11-TRUE16-NEXT: s_addk_i32 s42, 0x7fff -; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s14 -; GFX11-TRUE16-NEXT: s_and_b32 s43, vcc_lo, exec_lo -; GFX11-TRUE16-NEXT: s_cselect_b32 s29, s29, s42 -; GFX11-TRUE16-NEXT: s_lshl_b32 s21, s21, 16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s21 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v3, v7 -; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s29, 16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v6 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s42, s2, s11 -; GFX11-TRUE16-NEXT: s_bfe_u32 s21, s14, 0x10010 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v35.l -; GFX11-TRUE16-NEXT: s_add_i32 s21, s21, s14 -; GFX11-TRUE16-NEXT: s_bitset1_b32 s14, 22 -; GFX11-TRUE16-NEXT: s_addk_i32 s21, 0x7fff -; GFX11-TRUE16-NEXT: s_and_b32 s29, vcc_lo, exec_lo -; GFX11-TRUE16-NEXT: s_cselect_b32 s14, s14, s21 -; GFX11-TRUE16-NEXT: s_and_b32 s21, s20, 0xffff0000 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s21 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s13 -; GFX11-TRUE16-NEXT: s_lshr_b32 s21, s14, 16 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v6 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v7, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v37.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v36.l -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s43, s3, s59 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s12 -; GFX11-TRUE16-NEXT: s_bfe_u32 s12, s13, 0x10010 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: s_add_i32 s12, s12, s13 -; GFX11-TRUE16-NEXT: s_bitset1_b32 s13, 22 -; GFX11-TRUE16-NEXT: s_addk_i32 s12, 0x7fff -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; GFX11-TRUE16-NEXT: s_and_b32 s14, vcc_lo, exec_lo -; GFX11-TRUE16-NEXT: s_cselect_b32 s12, s13, s12 -; GFX11-TRUE16-NEXT: s_lshl_b32 s13, s20, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s13 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s12, 16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v6, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8 -; GFX11-TRUE16-NEXT: s_bfe_u32 s14, s13, 0x10010 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v7 -; GFX11-TRUE16-NEXT: s_add_i32 s14, s14, s13 -; GFX11-TRUE16-NEXT: s_bitset1_b32 s13, 22 -; GFX11-TRUE16-NEXT: s_addk_i32 s14, 0x7fff -; GFX11-TRUE16-NEXT: s_and_b32 s20, vcc_lo, exec_lo -; GFX11-TRUE16-NEXT: s_cselect_b32 s13, s13, s14 -; GFX11-TRUE16-NEXT: s_and_b32 s14, s23, 0xffff0000 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s14 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s47 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.l -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v7, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; GFX11-TRUE16-NEXT: s_bfe_u32 s20, s14, 0x10010 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: s_add_i32 s29, s20, s14 -; GFX11-TRUE16-NEXT: s_lshr_b32 s20, s13, 16 -; GFX11-TRUE16-NEXT: s_addk_i32 s29, 0x7fff -; GFX11-TRUE16-NEXT: s_bitset1_b32 s14, 22 -; GFX11-TRUE16-NEXT: s_and_b32 s13, vcc_lo, exec_lo -; GFX11-TRUE16-NEXT: s_cselect_b32 s13, s14, s29 -; GFX11-TRUE16-NEXT: s_lshl_b32 s14, s23, 16 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s41 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v3, v7 -; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s13, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v6 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s28 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: s_bfe_u32 s23, s14, 0x10010 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s0, s6 -; GFX11-TRUE16-NEXT: s_add_i32 s23, s23, s14 -; GFX11-TRUE16-NEXT: s_bitset1_b32 s14, 22 -; GFX11-TRUE16-NEXT: s_addk_i32 s23, 0x7fff -; GFX11-TRUE16-NEXT: s_and_b32 s13, vcc_lo, exec_lo -; GFX11-TRUE16-NEXT: s_cselect_b32 s13, s14, s23 -; GFX11-TRUE16-NEXT: s_and_b32 s14, s22, 0xffff0000 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s14 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s13, 16 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v6 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s15 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v9 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v38.l -; GFX11-TRUE16-NEXT: s_bfe_u32 s15, s14, 0x10010 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: s_add_i32 s15, s15, s14 -; GFX11-TRUE16-NEXT: s_bitset1_b32 s14, 22 -; GFX11-TRUE16-NEXT: s_addk_i32 s15, 0x7fff -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v2 -; GFX11-TRUE16-NEXT: s_and_b32 s13, vcc_lo, exec_lo -; GFX11-TRUE16-NEXT: s_cselect_b32 s13, s14, s15 -; GFX11-TRUE16-NEXT: s_lshl_b32 s14, s22, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v8, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s14 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v9, 16, 1 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s13, 16 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v48.l -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v8 -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v7, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 -; GFX11-TRUE16-NEXT: s_bfe_u32 s15, s14, 0x10010 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_add_i32 s15, s15, s14 -; GFX11-TRUE16-NEXT: s_bitset1_b32 s14, 22 -; GFX11-TRUE16-NEXT: s_addk_i32 s15, 0x7fff -; GFX11-TRUE16-NEXT: s_and_b32 s22, vcc_lo, exec_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-TRUE16-NEXT: s_cselect_b32 s14, s14, s15 -; GFX11-TRUE16-NEXT: s_and_b32 s15, s25, 0xffff0000 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 -; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s14, 16 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s15 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s10 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v3 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s9 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v1.l -; GFX11-TRUE16-NEXT: s_bfe_u32 s9, s10, 0x10010 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v7, 16, 1 -; GFX11-TRUE16-NEXT: s_add_i32 s9, s9, s10 -; GFX11-TRUE16-NEXT: s_bitset1_b32 s10, 22 -; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x7fff -; GFX11-TRUE16-NEXT: s_and_b32 s14, vcc_lo, exec_lo -; GFX11-TRUE16-NEXT: s_cselect_b32 s9, s10, s9 -; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s10 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v7 -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s9, 16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v7 -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v3 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v8, 16, 1 -; GFX11-TRUE16-NEXT: s_bfe_u32 s10, s8, 0x10010 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v51.l -; GFX11-TRUE16-NEXT: s_add_i32 s10, s10, s8 -; GFX11-TRUE16-NEXT: s_bitset1_b32 s8, 22 -; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x7fff -; GFX11-TRUE16-NEXT: s_and_b32 s9, vcc_lo, exec_lo -; GFX11-TRUE16-NEXT: s_cselect_b32 s8, s8, s10 -; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xffff0000 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s9 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s8, 16 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v10, v8 -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v3 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v50.l -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.h, v49.l -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s1, s58 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc_lo -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v9 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s7 -; GFX11-TRUE16-NEXT: s_bfe_u32 s7, s9, 0x10010 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_add_i32 s7, s7, s9 -; GFX11-TRUE16-NEXT: s_bitset1_b32 s9, 22 -; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v8 -; GFX11-TRUE16-NEXT: s_and_b32 s8, vcc_lo, exec_lo -; GFX11-TRUE16-NEXT: s_cselect_b32 s7, s9, s7 -; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s24, 16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s8 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s7, 16 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.h, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v53.l -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v9, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s5 -; GFX11-TRUE16-NEXT: s_bfe_u32 s5, s8, 0x10010 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v6, v9 -; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, s8 -; GFX11-TRUE16-NEXT: s_bitset1_b32 s8, 22 -; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x7fff -; GFX11-TRUE16-NEXT: s_and_b32 s7, vcc_lo, exec_lo -; GFX11-TRUE16-NEXT: s_cselect_b32 s5, s8, s5 -; GFX11-TRUE16-NEXT: s_and_b32 s7, s27, 0xffff0000 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s7 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v2, 16, 1 -; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s5, 16 -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v6 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v8, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v10, v2 -; GFX11-TRUE16-NEXT: s_bfe_u32 s7, s4, 0x10010 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 -; GFX11-TRUE16-NEXT: s_add_i32 s7, s7, s4 -; GFX11-TRUE16-NEXT: s_bitset1_b32 s4, 22 -; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x7fff -; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo -; GFX11-TRUE16-NEXT: s_cselect_b32 s4, s4, s7 -; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s27, 16 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s4, 16 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v54.l -; GFX11-TRUE16-NEXT: s_bfe_u32 s7, s5, 0x10010 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.h, v52.l -; GFX11-TRUE16-NEXT: s_add_i32 s7, s7, s5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x7fff -; GFX11-TRUE16-NEXT: s_bitset1_b32 s5, 22 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v2 -; GFX11-TRUE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo -; GFX11-TRUE16-NEXT: s_cselect_b32 s4, s5, s7 -; GFX11-TRUE16-NEXT: s_and_b32 s5, s26, 0xffff0000 -; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s4, 16 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v55.l -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[6:7], 24, v[22:23] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[7:8], 24, v[20:21] -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v2.l -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[18:19] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[9:10], 24, v[16:17] -; GFX11-TRUE16-NEXT: s_bfe_u32 s6, s5, 0x10010 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s63 -; GFX11-TRUE16-NEXT: s_add_i32 s6, s6, s5 -; GFX11-TRUE16-NEXT: s_bitset1_b32 s5, 22 -; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x7fff -; GFX11-TRUE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo -; GFX11-TRUE16-NEXT: s_cselect_b32 s14, s5, s6 -; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s26, 16 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s12 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s14, 16 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s61 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s40 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 8, v65 -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[1:2], 24, v[64:65] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[2:3], 24, v[68:69] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v65 -; GFX11-TRUE16-NEXT: s_bfe_u32 s12, s11, 0x10010 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v64 -; GFX11-TRUE16-NEXT: s_add_i32 s12, s12, s11 -; GFX11-TRUE16-NEXT: s_bitset1_b32 s11, 22 -; GFX11-TRUE16-NEXT: s_addk_i32 s12, 0x7fff -; GFX11-TRUE16-NEXT: s_and_b32 s14, vcc_lo, exec_lo -; GFX11-TRUE16-NEXT: s_cselect_b32 s12, s11, s12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v64 -; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s12, 16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 24, v69 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 8, v69 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v68 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 8, v68 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 24, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v21 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v21 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v17 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 8, v17 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 16, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 16, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 8, v4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s62 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s72 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s27, s73 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s46, s26, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_lshr_b64 s[94:95], s[8:9], 24 -; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[4:5], 24 -; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[44:45], 24 -; GFX11-TRUE16-NEXT: s_lshr_b64 s[40:41], s[42:43], 24 -; GFX11-TRUE16-NEXT: s_lshr_b64 s[56:57], s[28:29], 24 -; GFX11-TRUE16-NEXT: s_lshr_b64 vcc, s[46:47], 24 -; GFX11-TRUE16-NEXT: s_lshr_b64 s[34:35], s[10:11], 24 -; GFX11-TRUE16-NEXT: s_lshr_b64 s[30:31], s[6:7], 24 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s47, 24 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s47, 8 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s46, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s46, 8 -; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s11, 24 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s11, 8 -; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s10, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s10, 8 -; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s9, 24 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s9, 8 -; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s8, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s8, 8 -; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s7, 24 -; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s7, 8 -; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s6, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s6, 8 -; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s5, 24 -; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s5, 8 -; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s4, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s4, 8 -; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s45, 24 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s45, 8 -; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s44, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s44, 8 -; GFX11-TRUE16-NEXT: s_lshr_b32 s95, s43, 24 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s43, 8 -; GFX11-TRUE16-NEXT: s_lshr_b32 s99, s42, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s100, s42, 8 -; GFX11-TRUE16-NEXT: s_lshr_b32 s101, s29, 24 -; GFX11-TRUE16-NEXT: s_lshr_b32 s102, s29, 8 -; GFX11-TRUE16-NEXT: s_lshr_b32 s103, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s104, s28, 8 -; GFX11-TRUE16-NEXT: s_branch .LBB91_5 -; GFX11-TRUE16-NEXT: .LBB91_3: -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr104 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr103 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr102 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr101 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr100 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr99 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr49 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr37 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr35 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr34 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr52 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr53 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr50 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr64 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr38 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr54 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr96 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr51 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr67 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr68 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr65 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr55 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr39 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr71 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr69 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr66 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr82 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr83 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr80 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr97 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr70 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr48 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr84 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr98 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr81 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr86 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr87 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr85 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr30 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr94 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr92 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr90 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 0 -; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s5, 1 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 2 -; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s5, 3 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s74, 4 -; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s75, 5 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74 -; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5 -; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s74, 6 -; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s75, 7 -; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74 -; GFX11-TRUE16-NEXT: s_branch .LBB91_2 -; GFX11-TRUE16-NEXT: .LBB91_4: -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s94 :: v_dual_mov_b32 v14, s30 -; GFX11-TRUE16-NEXT: v_readlane_b32 s94, v43, 2 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v96, s37 :: v_dual_mov_b32 v87, s34 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s49 :: v_dual_mov_b32 v5, s35 -; GFX11-TRUE16-NEXT: v_readlane_b32 s95, v43, 3 -; GFX11-TRUE16-NEXT: v_readlane_b32 vcc_lo, v43, 6 -; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v43, 0 -; GFX11-TRUE16-NEXT: v_readlane_b32 s34, v43, 4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, s42 :: v_dual_mov_b32 v54, s43 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, s10 :: v_dual_mov_b32 v53, s44 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s45 :: v_dual_mov_b32 v49, s98 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, s46 :: v_dual_mov_b32 v38, s47 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, s97 :: v_dual_mov_b32 v39, s58 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, s59 :: v_dual_mov_b32 v36, s60 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, s9 :: v_dual_mov_b32 v32, s61 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s8 :: v_dual_mov_b32 v33, s62 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, s63 :: v_dual_mov_b32 v30, s72 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, s96 :: v_dual_mov_b32 v26, s73 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s7 :: v_dual_mov_b32 v27, s28 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s29 :: v_dual_mov_b32 v25, s6 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s87 :: v_dual_mov_b32 v64, s86 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s85 :: v_dual_mov_b32 v10, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, s4 :: v_dual_mov_b32 v68, s48 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v65, s81 :: v_dual_mov_b32 v66, s84 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v70, s83 :: v_dual_mov_b32 v69, s70 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s82 :: v_dual_mov_b32 v23, s80 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v80, s71 :: v_dual_mov_b32 v71, s66 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s39 :: v_dual_mov_b32 v21, s69 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v82, s68 :: v_dual_mov_b32 v81, s55 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s67 :: v_dual_mov_b32 v19, s65 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v84, s38 :: v_dual_mov_b32 v83, s51 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s64 :: v_dual_mov_b32 v17, s54 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v86, s53 :: v_dual_mov_b32 v11, s52 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v85, s36 :: v_dual_mov_b32 v12, s50 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s74 :: v_dual_mov_b32 v2, s76 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s78 :: v_dual_mov_b32 v7, s88 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s90 :: v_dual_mov_b32 v9, s92 -; GFX11-TRUE16-NEXT: s_mov_b32 s58, s11 -; GFX11-TRUE16-NEXT: v_readlane_b32 s59, v43, 8 -; GFX11-TRUE16-NEXT: v_readlane_b32 s60, v43, 9 -; GFX11-TRUE16-NEXT: v_readlane_b32 s61, v43, 10 -; GFX11-TRUE16-NEXT: v_readlane_b32 s62, v43, 11 -; GFX11-TRUE16-NEXT: v_readlane_b32 s63, v43, 12 -; GFX11-TRUE16-NEXT: v_readlane_b32 s72, v43, 13 -; GFX11-TRUE16-NEXT: v_readlane_b32 s73, v43, 14 -; GFX11-TRUE16-NEXT: v_readlane_b32 s13, v43, 15 -; GFX11-TRUE16-NEXT: v_readlane_b32 s15, v43, 16 -; GFX11-TRUE16-NEXT: v_readlane_b32 s41, v43, 17 -; GFX11-TRUE16-NEXT: v_readlane_b32 s46, v43, 18 -; GFX11-TRUE16-NEXT: v_readlane_b32 s47, v43, 19 -; GFX11-TRUE16-NEXT: v_readlane_b32 s11, v43, 20 -; GFX11-TRUE16-NEXT: v_readlane_b32 s57, v43, 21 -; GFX11-TRUE16-NEXT: v_readlane_b32 s10, v43, 22 -; GFX11-TRUE16-NEXT: v_readlane_b32 s74, v43, 23 -; GFX11-TRUE16-NEXT: v_readlane_b32 s9, v43, 24 -; GFX11-TRUE16-NEXT: v_readlane_b32 s75, v43, 25 -; GFX11-TRUE16-NEXT: v_readlane_b32 s8, v43, 26 -; GFX11-TRUE16-NEXT: v_readlane_b32 s76, v43, 27 -; GFX11-TRUE16-NEXT: v_readlane_b32 s77, v43, 28 -; GFX11-TRUE16-NEXT: v_readlane_b32 s78, v43, 29 -; GFX11-TRUE16-NEXT: v_readlane_b32 s79, v43, 30 -; GFX11-TRUE16-NEXT: v_readlane_b32 s88, v43, 31 -; GFX11-TRUE16-NEXT: v_readlane_b32 s89, v42, 0 -; GFX11-TRUE16-NEXT: v_readlane_b32 s90, v42, 1 -; GFX11-TRUE16-NEXT: v_readlane_b32 s91, v42, 2 -; GFX11-TRUE16-NEXT: v_readlane_b32 s92, v42, 3 -; GFX11-TRUE16-NEXT: v_readlane_b32 s45, v42, 4 -; GFX11-TRUE16-NEXT: v_readlane_b32 s93, v42, 5 -; GFX11-TRUE16-NEXT: v_readlane_b32 vcc_hi, v43, 7 -; GFX11-TRUE16-NEXT: v_readlane_b32 s44, v42, 6 -; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v43, 1 -; GFX11-TRUE16-NEXT: v_readlane_b32 s95, v42, 7 -; GFX11-TRUE16-NEXT: v_readlane_b32 s43, v42, 8 -; GFX11-TRUE16-NEXT: v_readlane_b32 s35, v43, 5 -; GFX11-TRUE16-NEXT: .LBB91_5: ; %end -; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s104, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s5, s103, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s56, 8 -; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s4 -; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s6 -; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s102, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s6, s58, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s101, 8 -; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s5 -; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s7 -; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff -; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16 -; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff -; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 -; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s4 -; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s5 -; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s100, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s5, s99, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s40, 8 -; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s4 -; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s6 -; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s43, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s6, s59, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s95, 8 -; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s5 -; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s7 -; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16 -; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff -; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 -; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s4 -; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v97, s0 :: v_dual_mov_b32 v98, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v99, s2 :: v_dual_mov_b32 v100, s3 -; GFX11-TRUE16-NEXT: s_and_b32 s0, s16, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s44, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s2, s93, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s14, 8 -; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 -; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 -; GFX11-TRUE16-NEXT: s_and_b32 s2, s17, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s45, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s60, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s92, 8 -; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 -; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 -; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff -; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 -; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 -; GFX11-TRUE16-NEXT: s_and_b32 s2, s18, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s91, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s90, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s12, 8 -; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 -; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s19, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s89, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s6, s61, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s88, 8 -; GFX11-TRUE16-NEXT: s_or_b32 s4, s4, s5 -; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s7 -; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff -; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 -; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 -; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v112, s0 :: v_dual_mov_b32 v113, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v114, s2 :: v_dual_mov_b32 v115, s3 -; GFX11-TRUE16-NEXT: s_and_b32 s0, s20, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s79, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s2, s78, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s30, 8 -; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 -; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 -; GFX11-TRUE16-NEXT: s_and_b32 s2, s21, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s77, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s62, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s76, 8 -; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 -; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 -; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff -; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 -; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 -; GFX11-TRUE16-NEXT: s_and_b32 s2, s22, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s8, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s75, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s94, 8 -; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 -; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s23, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s9, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s6, s63, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s74, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-TRUE16-NEXT: s_or_b32 s4, s4, s5 -; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s7 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff -; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 -; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[97:100], off -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[112:115], off offset:16 -; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v97, s0 :: v_dual_mov_b32 v98, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v99, s2 :: v_dual_mov_b32 v100, s3 -; GFX11-TRUE16-NEXT: s_and_b32 s0, s24, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s10, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s2, s57, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s34, 8 -; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 -; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s4 -; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff -; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s11, 8 -; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 -; GFX11-TRUE16-NEXT: s_and_b32 s1, s25, 0xff -; GFX11-TRUE16-NEXT: s_and_b32 s3, s72, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s47, 8 -; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 -; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s4 -; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff -; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 -; GFX11-TRUE16-NEXT: s_and_b32 s3, s26, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s46, 8 -; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 -; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s4 -; GFX11-TRUE16-NEXT: s_and_b32 s3, s41, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s4, vcc_lo, 8 -; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s15, 8 -; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s27, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s13, 8 -; GFX11-TRUE16-NEXT: s_or_b32 s4, s4, s5 -; GFX11-TRUE16-NEXT: s_and_b32 s5, s73, 0xff -; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 -; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff -; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v112, s0 :: v_dual_and_b32 v27, 0xff, v27 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v113, s1 :: v_dual_lshlrev_b32 v4, 8, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 -; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v115, s3 :: v_dual_and_b32 v96, 0xff, v96 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v27, v4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v114, s2 :: v_dual_lshlrev_b32 v5, 8, v5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v96, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v4, v14 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v87 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xff, v30 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xff, v86 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v26, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v14, v24 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v25, v11 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v30, v13 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v28 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v85 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xff, v33 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v84 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xff, v29 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 8, v83 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v24 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v25, v16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v26, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v28, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v29, v30 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v4, v5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v11, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v12, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v16, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v17, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v36 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v82 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v32 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v19 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v34 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v81 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v39 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v20 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v8 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v16, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v19 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v80 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v35 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v21 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v37 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v71 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v51 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v22 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v70 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v14, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v16, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v18, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v21 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v22, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v7 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v8 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v11, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v18 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v14, v16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v17, v19 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v38 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v23 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v48 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 8, v69 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v53 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v68 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v67 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v50 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v66 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v14, v16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v18 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v19, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v20, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v65 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v55 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v49 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v64 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v54 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v52 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v18 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v19, v20 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v21, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v22, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v14 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v8, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v18, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v10, v3 -; GFX11-TRUE16-NEXT: s_clause 0x5 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[97:100], off offset:32 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[112:115], off offset:48 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[27:30], off offset:64 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[4:7], off offset:80 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[11:14], off offset:96 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[15:18], off offset:112 -; GFX11-TRUE16-NEXT: v_readlane_b32 s104, v41, 8 -; GFX11-TRUE16-NEXT: v_readlane_b32 s103, v41, 7 -; GFX11-TRUE16-NEXT: v_readlane_b32 s102, v41, 6 -; GFX11-TRUE16-NEXT: v_readlane_b32 s101, v41, 5 -; GFX11-TRUE16-NEXT: v_readlane_b32 s100, v41, 4 -; GFX11-TRUE16-NEXT: v_readlane_b32 s99, v41, 3 -; GFX11-TRUE16-NEXT: v_readlane_b32 s98, v41, 2 -; GFX11-TRUE16-NEXT: v_readlane_b32 s97, v41, 1 -; GFX11-TRUE16-NEXT: v_readlane_b32 s96, v41, 0 -; GFX11-TRUE16-NEXT: v_readlane_b32 s87, v40, 31 -; GFX11-TRUE16-NEXT: v_readlane_b32 s86, v40, 30 -; GFX11-TRUE16-NEXT: v_readlane_b32 s85, v40, 29 -; GFX11-TRUE16-NEXT: v_readlane_b32 s84, v40, 28 -; GFX11-TRUE16-NEXT: v_readlane_b32 s83, v40, 27 -; GFX11-TRUE16-NEXT: v_readlane_b32 s82, v40, 26 -; GFX11-TRUE16-NEXT: v_readlane_b32 s81, v40, 25 -; GFX11-TRUE16-NEXT: v_readlane_b32 s80, v40, 24 -; GFX11-TRUE16-NEXT: v_readlane_b32 s71, v40, 23 -; GFX11-TRUE16-NEXT: v_readlane_b32 s70, v40, 22 -; GFX11-TRUE16-NEXT: v_readlane_b32 s69, v40, 21 -; GFX11-TRUE16-NEXT: v_readlane_b32 s68, v40, 20 -; GFX11-TRUE16-NEXT: v_readlane_b32 s67, v40, 19 -; GFX11-TRUE16-NEXT: v_readlane_b32 s66, v40, 18 -; GFX11-TRUE16-NEXT: v_readlane_b32 s65, v40, 17 -; GFX11-TRUE16-NEXT: v_readlane_b32 s64, v40, 16 -; GFX11-TRUE16-NEXT: v_readlane_b32 s55, v40, 15 -; GFX11-TRUE16-NEXT: v_readlane_b32 s54, v40, 14 -; GFX11-TRUE16-NEXT: v_readlane_b32 s53, v40, 13 -; GFX11-TRUE16-NEXT: v_readlane_b32 s52, v40, 12 -; GFX11-TRUE16-NEXT: v_readlane_b32 s51, v40, 11 -; GFX11-TRUE16-NEXT: v_readlane_b32 s50, v40, 10 -; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v40, 9 -; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v40, 8 -; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v40, 7 -; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v40, 6 -; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v40, 5 -; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v40, 4 -; GFX11-TRUE16-NEXT: v_readlane_b32 s35, v40, 3 -; GFX11-TRUE16-NEXT: v_readlane_b32 s34, v40, 2 -; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-TRUE16-NEXT: s_clause 0x3 ; 16-byte Folded Reload -; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:12 -; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v128i8_scalar: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11-FAKE16-NEXT: s_clause 0x3 ; 16-byte Folded Spill -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:12 -; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s4 -; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s96, 0 -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s72, v1 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s73, v2 -; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s97, 1 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s62, v3 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s63, v4 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s60, v5 -; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s34, 2 -; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s98, 2 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s61, v6 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s58, v7 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s59, v8 -; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s35, 3 -; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s99, 3 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s56, v9 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s57, v10 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s46, v11 -; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s36, 4 -; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s100, 4 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s47, v12 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s44, v13 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s45, v14 -; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s37, 5 -; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s101, 5 -; GFX11-FAKE16-NEXT: s_mov_b32 vcc_hi, 0 -; GFX11-FAKE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane -; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s38, 6 -; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s102, 6 -; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s39, 7 -; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s103, 7 -; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s48, 8 -; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s104, 8 -; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s49, 9 -; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s50, 10 -; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s51, 11 -; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s52, 12 -; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s53, 13 -; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s54, 14 -; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s55, 15 -; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s64, 16 -; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s65, 17 -; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s66, 18 -; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s67, 19 -; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s68, 20 -; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s69, 21 -; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s70, 22 -; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s71, 23 -; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s80, 24 -; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s81, 25 -; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s82, 26 -; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s83, 27 -; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s84, 28 -; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s85, 29 -; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s86, 30 -; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s87, 31 -; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB91_3 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s27, 24 -; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[26:27], 24 -; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 15 -; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s27, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s99, s2, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s100, s2, 8 -; GFX11-FAKE16-NEXT: s_lshr_b32 s101, s1, 24 -; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 14 -; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s27, 8 -; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s1, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s102, s1, 8 -; GFX11-FAKE16-NEXT: s_lshr_b32 s103, s0, 16 -; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s26, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s104, s0, 8 -; GFX11-FAKE16-NEXT: s_lshr_b32 s85, s45, 24 -; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s45, 16 -; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 17 -; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s26, 8 -; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s45, 8 -; GFX11-FAKE16-NEXT: s_lshr_b32 s87, s44, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s86, s44, 8 -; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 18 -; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s25, 24 -; GFX11-FAKE16-NEXT: s_lshr_b32 s81, s47, 24 -; GFX11-FAKE16-NEXT: s_lshr_b32 s98, s47, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s84, s47, 8 -; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 19 -; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s25, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s46, 8 -; GFX11-FAKE16-NEXT: s_lshr_b32 s70, s57, 24 -; GFX11-FAKE16-NEXT: s_lshr_b32 s97, s57, 16 -; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 13 -; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s25, 8 -; GFX11-FAKE16-NEXT: s_lshr_b32 s80, s57, 8 -; GFX11-FAKE16-NEXT: s_lshr_b32 s83, s56, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s82, s56, 8 -; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 20 -; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s24, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s66, s59, 24 -; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s59, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s69, s59, 8 -; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 21 -; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s24, 8 -; GFX11-FAKE16-NEXT: s_lshr_b32 s71, s58, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s58, 8 -; GFX11-FAKE16-NEXT: s_lshr_b32 s55, s61, 24 -; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 22 -; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s23, 24 -; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s61, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s65, s61, 8 -; GFX11-FAKE16-NEXT: s_lshr_b32 s68, s60, 16 -; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 23 -; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s23, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s67, s60, 8 -; GFX11-FAKE16-NEXT: s_lshr_b32 s51, s63, 24 -; GFX11-FAKE16-NEXT: s_lshr_b32 s96, s63, 16 -; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 12 -; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s23, 8 -; GFX11-FAKE16-NEXT: s_lshr_b32 s54, s63, 8 -; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s62, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s64, s62, 8 -; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 24 -; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s22, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s73, 24 -; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s73, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s50, s73, 8 -; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 25 -; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s22, 8 -; GFX11-FAKE16-NEXT: s_lshr_b32 s53, s72, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s52, s72, 8 -; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s29, 24 -; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 26 -; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s21, 24 -; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s29, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s29, 8 -; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s28, 16 -; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 27 -; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s21, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s49, s28, 8 -; GFX11-FAKE16-NEXT: s_lshr_b64 s[14:15], s[16:17], 24 -; GFX11-FAKE16-NEXT: s_lshr_b64 s[40:41], s[2:3], 24 -; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 11 -; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s21, 8 -; GFX11-FAKE16-NEXT: s_lshr_b64 s[42:43], s[0:1], 24 -; GFX11-FAKE16-NEXT: s_lshr_b64 s[74:75], s[44:45], 24 -; GFX11-FAKE16-NEXT: s_lshr_b64 s[76:77], s[46:47], 24 -; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 28 -; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s20, 16 -; GFX11-FAKE16-NEXT: s_lshr_b64 s[78:79], s[56:57], 24 -; GFX11-FAKE16-NEXT: s_lshr_b64 s[88:89], s[58:59], 24 -; GFX11-FAKE16-NEXT: s_lshr_b64 s[90:91], s[60:61], 24 -; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 29 -; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s20, 8 -; GFX11-FAKE16-NEXT: s_lshr_b64 s[92:93], s[62:63], 24 -; GFX11-FAKE16-NEXT: s_lshr_b64 s[94:95], s[72:73], 24 -; GFX11-FAKE16-NEXT: s_lshr_b64 s[30:31], s[28:29], 24 -; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 30 -; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s19, 24 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 31 -; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s19, 16 -; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 10 -; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s19, 8 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 0 -; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 1 -; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s18, 8 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 2 -; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s17, 24 -; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 3 -; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s17, 16 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 9 -; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s17, 8 -; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 4 -; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 5 -; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 8 -; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 6 -; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s3, 24 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 7 -; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s3, 16 -; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 8 -; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s3, 8 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 8 -; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s46, 16 -; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s12, 6 -; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s13, 7 -; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 -; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s12, 4 -; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s13, 5 -; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[22:23], 24 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s12, 2 -; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s13, 3 -; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[20:21], 24 -; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s12, 0 -; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s13, 1 -; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[18:19], 24 -; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_hi -; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB91_4 -; GFX11-FAKE16-NEXT: .LBB91_2: ; %cmp.true -; GFX11-FAKE16-NEXT: s_and_b32 s4, s29, 0xffff0000 -; GFX11-FAKE16-NEXT: s_and_b32 s14, s47, 0xffff0000 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: s_and_b32 s4, s1, 0xffff0000 -; GFX11-FAKE16-NEXT: s_lshl_b32 s15, s47, 16 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 16 -; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s6 -; GFX11-FAKE16-NEXT: s_and_b32 s8, s45, 0xffff0000 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s47, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1 -; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s45, 16 -; GFX11-FAKE16-NEXT: s_and_b32 s78, s28, 0xffff0000 -; GFX11-FAKE16-NEXT: s_bfe_u32 s6, s47, 0x10010 -; GFX11-FAKE16-NEXT: s_lshl_b32 s79, s28, 16 -; GFX11-FAKE16-NEXT: s_add_i32 s45, s6, s47 -; GFX11-FAKE16-NEXT: s_and_b32 s5, s73, 0xffff0000 -; GFX11-FAKE16-NEXT: s_lshl_b32 s77, s73, 16 -; GFX11-FAKE16-NEXT: s_and_b32 s75, s72, 0xffff0000 -; GFX11-FAKE16-NEXT: s_lshl_b32 s76, s72, 16 -; GFX11-FAKE16-NEXT: s_and_b32 s11, s63, 0xffff0000 -; GFX11-FAKE16-NEXT: s_lshl_b32 s74, s63, 16 -; GFX11-FAKE16-NEXT: s_and_b32 s72, s62, 0xffff0000 -; GFX11-FAKE16-NEXT: s_lshl_b32 s73, s62, 16 -; GFX11-FAKE16-NEXT: s_and_b32 s63, s61, 0xffff0000 -; GFX11-FAKE16-NEXT: s_lshl_b32 s62, s61, 16 -; GFX11-FAKE16-NEXT: s_and_b32 s61, s60, 0xffff0000 -; GFX11-FAKE16-NEXT: s_lshl_b32 s60, s60, 16 -; GFX11-FAKE16-NEXT: s_and_b32 s41, s59, 0xffff0000 -; GFX11-FAKE16-NEXT: s_lshl_b32 s40, s59, 16 -; GFX11-FAKE16-NEXT: s_and_b32 s28, s58, 0xffff0000 -; GFX11-FAKE16-NEXT: s_lshl_b32 s29, s58, 16 -; GFX11-FAKE16-NEXT: s_and_b32 s13, s57, 0xffff0000 -; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s57, 16 -; GFX11-FAKE16-NEXT: s_and_b32 s42, s56, 0xffff0000 -; GFX11-FAKE16-NEXT: s_lshl_b32 s43, s56, 16 -; GFX11-FAKE16-NEXT: s_and_b32 s12, s46, 0xffff0000 -; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s46, 16 -; GFX11-FAKE16-NEXT: s_and_b32 s4, s44, 0xffff0000 -; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s44, 16 -; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff -; GFX11-FAKE16-NEXT: s_bitset1_b32 s47, 22 -; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 -; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-FAKE16-NEXT: s_cselect_b32 s44, s47, s45 -; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v2 -; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s44, 16 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s78 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s1, v3 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s79 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s1, 0x10010 -; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1 -; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s1 -; GFX11-FAKE16-NEXT: s_bitset1_b32 s1, 22 -; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff -; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo -; GFX11-FAKE16-NEXT: s_cselect_b32 s1, s1, s45 -; GFX11-FAKE16-NEXT: s_and_b32 s44, s0, 0xffff0000 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v2 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s44 -; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v7, 16, 1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v4, v6 -; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s1, 16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v7 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s44, v2 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 -; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s44, 0x10010 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s44 -; GFX11-FAKE16-NEXT: s_bitset1_b32 s44, 22 -; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff -; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v5, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v6 -; GFX11-FAKE16-NEXT: s_cselect_b32 s44, s44, s45 -; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v21 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s5 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s77 -; GFX11-FAKE16-NEXT: s_bfe_u32 s5, s0, 0x10010 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v22, 16, v4 -; GFX11-FAKE16-NEXT: s_add_i32 s45, s5, s0 -; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s44, 16 -; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff -; GFX11-FAKE16-NEXT: s_bitset1_b32 s0, 22 -; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo -; GFX11-FAKE16-NEXT: s_cselect_b32 s0, s0, s45 -; GFX11-FAKE16-NEXT: s_and_b32 s44, s3, 0xffff0000 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v1 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s44 -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v8, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v5, 16, 1 -; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, 16 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v23 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s44, v9 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v6, v8 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v5 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v2, 16, v3 -; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s44, 0x10010 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v5 -; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s44 -; GFX11-FAKE16-NEXT: s_bitset1_b32 s44, 22 -; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff -; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8 -; GFX11-FAKE16-NEXT: s_cselect_b32 s44, s44, s45 -; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s3 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s76 -; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s44, 16 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s75 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s3, v10 -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v9, 16, 1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 24, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v6 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1 -; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s3, 0x10010 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s3 -; GFX11-FAKE16-NEXT: s_bitset1_b32 s3, 22 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff -; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo -; GFX11-FAKE16-NEXT: s_cselect_b32 s3, s3, s45 -; GFX11-FAKE16-NEXT: s_and_b32 s44, s2, 0xffff0000 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v1 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s44 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v8, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v9 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s44, v1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s3, 16 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v4 -; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s44, 0x10010 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v24 -; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s44 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff -; GFX11-FAKE16-NEXT: s_bitset1_b32 s44, 22 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s74 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v25, 16, v5 -; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo -; GFX11-FAKE16-NEXT: s_cselect_b32 s44, s44, s45 -; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s2 -; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 24, v14 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s2, v8 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s11 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-FAKE16-NEXT: s_bfe_u32 s11, s2, 0x10010 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-FAKE16-NEXT: s_add_i32 s45, s11, s2 -; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s44, 16 -; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff -; GFX11-FAKE16-NEXT: s_bitset1_b32 s2, 22 -; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo -; GFX11-FAKE16-NEXT: s_cselect_b32 s2, s2, s45 -; GFX11-FAKE16-NEXT: s_and_b32 s44, s17, 0xffff0000 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v26 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s44 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v3, 16, 1 -; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, 16 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v2, 16, v9 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s44, v5 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 16, v13 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo -; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s44, 0x10010 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s44 -; GFX11-FAKE16-NEXT: s_bitset1_b32 s44, 22 -; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 -; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo -; GFX11-FAKE16-NEXT: s_cselect_b32 s44, s44, s45 -; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s17, 16 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s73 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s17 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s72 -; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s17, v4 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s44, 16 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s17, 0x10010 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v27 -; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s17 -; GFX11-FAKE16-NEXT: s_bitset1_b32 s17, 22 -; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff -; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v28, 16, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX11-FAKE16-NEXT: s_cselect_b32 s17, s17, s45 -; GFX11-FAKE16-NEXT: s_and_b32 s44, s16, 0xffff0000 -; GFX11-FAKE16-NEXT: s_lshr_b32 s17, s17, 16 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s63 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 24, v16 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v2 -; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v5, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v29 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v8, v1 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s44 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s44, v8 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s44, 0x10010 -; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s44 -; GFX11-FAKE16-NEXT: s_bitset1_b32 s44, 22 -; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff -; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo -; GFX11-FAKE16-NEXT: s_cselect_b32 s44, s44, s45 -; GFX11-FAKE16-NEXT: s_lshl_b32 s16, s16, 16 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s44, 16 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s16, v8 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s62 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s16, 0x10010 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s16 -; GFX11-FAKE16-NEXT: s_bitset1_b32 s16, 22 -; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff -; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo -; GFX11-FAKE16-NEXT: s_cselect_b32 s16, s16, s45 -; GFX11-FAKE16-NEXT: s_and_b32 s44, s19, 0xffff0000 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s44 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v5 -; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v4, 16, 1 -; GFX11-FAKE16-NEXT: s_lshr_b32 s16, s16, 16 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v8, vcc_lo -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s44, v10 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v4 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s60 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v4 -; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s44, 0x10010 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s61 -; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s44 -; GFX11-FAKE16-NEXT: s_bitset1_b32 s44, 22 -; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff -; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo -; GFX11-FAKE16-NEXT: s_cselect_b32 s44, s44, s45 -; GFX11-FAKE16-NEXT: s_lshl_b32 s19, s19, 16 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v9 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s19 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v8, 16, 1 -; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s44, 16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v1 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s19, v10 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v9, v8 -; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s19, 0x10010 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v2 -; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s19 -; GFX11-FAKE16-NEXT: s_bitset1_b32 s19, 22 -; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff -; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo -; GFX11-FAKE16-NEXT: s_cselect_b32 s19, s19, s45 -; GFX11-FAKE16-NEXT: s_and_b32 s44, s18, 0xffff0000 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v3, v5 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v8 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s44 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-FAKE16-NEXT: s_lshr_b32 s19, s19, 16 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s29 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s41 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s41, v4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s47, s17, s72 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v2 -; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v3, 16, 1 -; GFX11-FAKE16-NEXT: s_bfe_u32 s44, s41, 0x10010 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-FAKE16-NEXT: s_add_i32 s44, s44, s41 -; GFX11-FAKE16-NEXT: s_bitset1_b32 s41, 22 -; GFX11-FAKE16-NEXT: s_addk_i32 s44, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s40 -; GFX11-FAKE16-NEXT: s_and_b32 s45, vcc_lo, exec_lo -; GFX11-FAKE16-NEXT: s_cselect_b32 s41, s41, s44 -; GFX11-FAKE16-NEXT: s_lshl_b32 s18, s18, 16 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v31 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v32 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v3 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v30, 16, v4 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s18, v5 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v1, 16, v8 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 -; GFX11-FAKE16-NEXT: s_bfe_u32 s40, s18, 0x10010 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s28 -; GFX11-FAKE16-NEXT: s_add_i32 s44, s40, s18 -; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s41, 16 -; GFX11-FAKE16-NEXT: s_addk_i32 s44, 0x7fff -; GFX11-FAKE16-NEXT: s_bitset1_b32 s18, 22 -; GFX11-FAKE16-NEXT: s_and_b32 s41, vcc_lo, exec_lo -; GFX11-FAKE16-NEXT: s_cselect_b32 s18, s18, s44 -; GFX11-FAKE16-NEXT: s_and_b32 s41, s21, 0xffff0000 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s41 -; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v9, 16, 1 -; GFX11-FAKE16-NEXT: s_lshr_b32 s18, s18, 16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v9 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s28, v5 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v9 -; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v8, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v10, 16, 1 -; GFX11-FAKE16-NEXT: s_bfe_u32 s29, s28, 0x10010 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; GFX11-FAKE16-NEXT: s_add_i32 s29, s29, s28 -; GFX11-FAKE16-NEXT: s_bitset1_b32 s28, 22 -; GFX11-FAKE16-NEXT: s_addk_i32 s29, 0x7fff -; GFX11-FAKE16-NEXT: s_and_b32 s41, vcc_lo, exec_lo -; GFX11-FAKE16-NEXT: s_cselect_b32 s28, s28, s29 -; GFX11-FAKE16-NEXT: s_lshl_b32 s21, s21, 16 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s21 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s28, 16 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v5, v10 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s44, s2, s11 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s21, v11 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v4, v8 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-FAKE16-NEXT: s_bfe_u32 s29, s21, 0x10010 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v2 -; GFX11-FAKE16-NEXT: s_add_i32 s29, s29, s21 -; GFX11-FAKE16-NEXT: s_bitset1_b32 s21, 22 -; GFX11-FAKE16-NEXT: s_addk_i32 s29, 0x7fff -; GFX11-FAKE16-NEXT: s_and_b32 s28, vcc_lo, exec_lo -; GFX11-FAKE16-NEXT: s_cselect_b32 s21, s21, s29 -; GFX11-FAKE16-NEXT: s_and_b32 s28, s20, 0xffff0000 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v8 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s28 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v10 -; GFX11-FAKE16-NEXT: s_lshr_b32 s21, s21, 16 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s45, s3, s59 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s46, s16, s46 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s13 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 24, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_bfe_u32 s28, s13, 0x10010 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v34 -; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, s13 -; GFX11-FAKE16-NEXT: s_bitset1_b32 s13, 22 -; GFX11-FAKE16-NEXT: s_addk_i32 s28, 0x7fff -; GFX11-FAKE16-NEXT: s_and_b32 s29, vcc_lo, exec_lo -; GFX11-FAKE16-NEXT: s_cselect_b32 s13, s13, s28 -; GFX11-FAKE16-NEXT: s_lshl_b32 s20, s20, 16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v1 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s20 -; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s10 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v35 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s20, v8 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v3 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v2, 16, v9 -; GFX11-FAKE16-NEXT: s_bfe_u32 s10, s20, 0x10010 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-FAKE16-NEXT: s_add_i32 s28, s10, s20 -; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s13, 16 -; GFX11-FAKE16-NEXT: s_addk_i32 s28, 0x7fff -; GFX11-FAKE16-NEXT: s_bitset1_b32 s20, 22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v3 -; GFX11-FAKE16-NEXT: s_and_b32 s13, vcc_lo, exec_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-FAKE16-NEXT: s_cselect_b32 s13, s20, s28 -; GFX11-FAKE16-NEXT: s_and_b32 s20, s23, 0xffff0000 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s42 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s20 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s43 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s28, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v19 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-FAKE16-NEXT: s_bfe_u32 s20, s28, 0x10010 -; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v8, 16, 1 -; GFX11-FAKE16-NEXT: s_add_i32 s29, s20, s28 -; GFX11-FAKE16-NEXT: s_lshr_b32 s20, s13, 16 -; GFX11-FAKE16-NEXT: s_addk_i32 s29, 0x7fff -; GFX11-FAKE16-NEXT: s_bitset1_b32 s28, 22 -; GFX11-FAKE16-NEXT: s_and_b32 s13, vcc_lo, exec_lo -; GFX11-FAKE16-NEXT: s_cselect_b32 s13, s28, s29 -; GFX11-FAKE16-NEXT: s_lshl_b32 s23, s23, 16 -; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v9, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v4, v8 -; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s13, 16 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v5, v9 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s23, v2 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v8 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-FAKE16-NEXT: s_bfe_u32 s28, s23, 0x10010 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v9 -; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, s23 -; GFX11-FAKE16-NEXT: s_bitset1_b32 s23, 22 -; GFX11-FAKE16-NEXT: s_addk_i32 s28, 0x7fff -; GFX11-FAKE16-NEXT: s_and_b32 s13, vcc_lo, exec_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-FAKE16-NEXT: s_cselect_b32 s13, s23, s28 -; GFX11-FAKE16-NEXT: s_and_b32 s23, s22, 0xffff0000 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s15 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v36 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s23 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s14 -; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s13, 16 -; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v8, 16, 1 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s14, v3 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v71, v37, 16, v4 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s12 -; GFX11-FAKE16-NEXT: s_bfe_u32 s15, s14, 0x10010 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; GFX11-FAKE16-NEXT: s_add_i32 s15, s15, s14 -; GFX11-FAKE16-NEXT: s_bitset1_b32 s14, 22 -; GFX11-FAKE16-NEXT: s_addk_i32 s15, 0x7fff -; GFX11-FAKE16-NEXT: s_and_b32 s13, vcc_lo, exec_lo -; GFX11-FAKE16-NEXT: s_cselect_b32 s13, s14, s15 -; GFX11-FAKE16-NEXT: s_lshl_b32 s14, s22, 16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s14 -; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v38 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v8 -; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s13, 16 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s14, v10 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v5 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v70, v2, 16, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v9 -; GFX11-FAKE16-NEXT: s_bfe_u32 s12, s14, 0x10010 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v8 -; GFX11-FAKE16-NEXT: s_add_i32 s12, s12, s14 -; GFX11-FAKE16-NEXT: s_bitset1_b32 s14, 22 -; GFX11-FAKE16-NEXT: s_addk_i32 s12, 0x7fff -; GFX11-FAKE16-NEXT: s_and_b32 s15, vcc_lo, exec_lo -; GFX11-FAKE16-NEXT: s_cselect_b32 s12, s14, s12 -; GFX11-FAKE16-NEXT: s_and_b32 s14, s25, 0xffff0000 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s14 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s9 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v10 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s12, 16 -; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; GFX11-FAKE16-NEXT: s_bfe_u32 s14, s9, 0x10010 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-FAKE16-NEXT: s_add_i32 s14, s14, s9 -; GFX11-FAKE16-NEXT: s_bitset1_b32 s9, 22 -; GFX11-FAKE16-NEXT: s_addk_i32 s14, 0x7fff -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; GFX11-FAKE16-NEXT: s_and_b32 s12, vcc_lo, exec_lo -; GFX11-FAKE16-NEXT: s_cselect_b32 s9, s9, s14 -; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s25, 16 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s8 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s12 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v3, v4 -; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s9, 16 -; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v8, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v8 -; GFX11-FAKE16-NEXT: s_bfe_u32 s12, s8, 0x10010 -; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v9, 16, 1 -; GFX11-FAKE16-NEXT: s_add_i32 s12, s12, s8 -; GFX11-FAKE16-NEXT: s_bitset1_b32 s8, 22 -; GFX11-FAKE16-NEXT: s_addk_i32 s12, 0x7fff -; GFX11-FAKE16-NEXT: s_and_b32 s9, vcc_lo, exec_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-FAKE16-NEXT: s_cselect_b32 s8, s8, s12 -; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xffff0000 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 -; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s8, 16 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s9 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s7 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v12, v9 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s6 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v2 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v9 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s0, s5 -; GFX11-FAKE16-NEXT: s_bfe_u32 s9, s7, 0x10010 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v3 -; GFX11-FAKE16-NEXT: s_add_i32 s9, s9, s7 -; GFX11-FAKE16-NEXT: s_bitset1_b32 s7, 22 -; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x7fff -; GFX11-FAKE16-NEXT: s_and_b32 s8, vcc_lo, exec_lo -; GFX11-FAKE16-NEXT: s_cselect_b32 s7, s7, s9 -; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s24, 16 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s8 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v8, 16, 1 -; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s7, 16 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v10 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v8 -; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v12, 16, 1 -; GFX11-FAKE16-NEXT: s_bfe_u32 s4, s8, 0x10010 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v2 -; GFX11-FAKE16-NEXT: s_add_i32 s4, s4, s8 -; GFX11-FAKE16-NEXT: s_bitset1_b32 s8, 22 -; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x7fff -; GFX11-FAKE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo -; GFX11-FAKE16-NEXT: s_cselect_b32 s4, s8, s4 -; GFX11-FAKE16-NEXT: s_and_b32 s6, s27, 0xffff0000 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v52, 0x40c00000, s6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v12 -; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s4, 16 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v9 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 -; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v9, 16, 1 -; GFX11-FAKE16-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_add_i32 s7, s7, s6 -; GFX11-FAKE16-NEXT: s_bitset1_b32 s6, 22 -; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x7fff -; GFX11-FAKE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo -; GFX11-FAKE16-NEXT: s_cselect_b32 s4, s6, s7 -; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s27, 16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v4, v9 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v12 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s4, 16 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v49 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v8 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v66, v1, 16, v11 -; GFX11-FAKE16-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-FAKE16-NEXT: s_add_i32 s7, s7, s6 -; GFX11-FAKE16-NEXT: s_bitset1_b32 s6, 22 -; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x7fff -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-FAKE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo -; GFX11-FAKE16-NEXT: s_cselect_b32 s4, s6, s7 -; GFX11-FAKE16-NEXT: s_and_b32 s6, s26, 0xffff0000 -; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s4, 16 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s6 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v52 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v39 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v55, v50, 16, v4 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s22, s13 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v3 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v54, v2, 16, v8 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v67, v48, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[17:18] -; GFX11-FAKE16-NEXT: s_bfe_u32 s5, s6, 0x10010 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[9:10], 24, v[15:16] -; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, s6 -; GFX11-FAKE16-NEXT: s_bitset1_b32 s6, 22 -; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x7fff -; GFX11-FAKE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo -; GFX11-FAKE16-NEXT: s_cselect_b32 s14, s6, s5 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s26, 16 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s20, s10 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s14, 16 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[10:11], 24, v[13:14] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[6:7] -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s1, s58 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[1:2], 24, v[54:55] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[2:3], 24, v[66:67] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[70:71] -; GFX11-FAKE16-NEXT: s_bfe_u32 s10, s11, 0x10010 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[4:5], 24, v[19:20] -; GFX11-FAKE16-NEXT: s_add_i32 s10, s10, s11 -; GFX11-FAKE16-NEXT: s_bitset1_b32 s11, 22 -; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x7fff -; GFX11-FAKE16-NEXT: s_and_b32 s14, vcc_lo, exec_lo -; GFX11-FAKE16-NEXT: s_cselect_b32 s10, s11, s10 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s19, s60 -; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s10, 16 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s18, s40 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s23, s62 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 24, v55 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 8, v55 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v54 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 8, v54 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 24, v67 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 8, v67 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v66 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 8, v66 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 24, v71 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 8, v71 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v70 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 8, v70 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 24, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 8, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 8, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 8, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 8, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 8, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 8, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 8, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 8, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 8, v6 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s21, s61 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s25, s63 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s57, s27, s73 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s56, s26, s13 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s24, s12 -; GFX11-FAKE16-NEXT: s_lshr_b64 s[94:95], s[8:9], 24 -; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[4:5], 24 -; GFX11-FAKE16-NEXT: s_lshr_b64 s[14:15], s[46:47], 24 -; GFX11-FAKE16-NEXT: s_lshr_b64 s[40:41], s[44:45], 24 -; GFX11-FAKE16-NEXT: s_lshr_b64 s[42:43], s[28:29], 24 -; GFX11-FAKE16-NEXT: s_lshr_b64 vcc, s[56:57], 24 -; GFX11-FAKE16-NEXT: s_lshr_b64 s[34:35], s[10:11], 24 -; GFX11-FAKE16-NEXT: s_lshr_b64 s[30:31], s[6:7], 24 -; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s57, 24 -; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s57, 8 -; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s56, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s56, 8 -; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s11, 24 -; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s11, 8 -; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s10, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s10, 8 -; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s9, 24 -; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s9, 8 -; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s8, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s8, 8 -; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s7, 24 -; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s7, 8 -; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s6, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s6, 8 -; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s5, 24 -; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s5, 8 -; GFX11-FAKE16-NEXT: s_lshr_b32 s90, s4, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s91, s4, 8 -; GFX11-FAKE16-NEXT: s_lshr_b32 s92, s47, 24 -; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s47, 8 -; GFX11-FAKE16-NEXT: s_lshr_b32 s93, s46, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s46, 8 -; GFX11-FAKE16-NEXT: s_lshr_b32 s95, s45, 24 -; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s45, 8 -; GFX11-FAKE16-NEXT: s_lshr_b32 s99, s44, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s100, s44, 8 -; GFX11-FAKE16-NEXT: s_lshr_b32 s101, s29, 24 -; GFX11-FAKE16-NEXT: s_lshr_b32 s102, s29, 8 -; GFX11-FAKE16-NEXT: s_lshr_b32 s103, s28, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s104, s28, 8 -; GFX11-FAKE16-NEXT: s_branch .LBB91_5 -; GFX11-FAKE16-NEXT: .LBB91_3: -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr104 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr103 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr102 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr101 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr100 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr99 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr49 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr37 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr35 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr34 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr52 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr53 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr50 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr36 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr64 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr38 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr54 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr96 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr51 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr67 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr68 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr65 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr55 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr39 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr71 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr69 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr66 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr82 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr83 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr80 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr97 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr70 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr48 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr84 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr98 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr81 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr86 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr87 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr85 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr30 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr94 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr92 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr90 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 0 -; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s5, 1 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 2 -; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s5, 3 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s74, 4 -; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s75, 5 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 -; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5 -; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s74, 6 -; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s75, 7 -; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 -; GFX11-FAKE16-NEXT: s_branch .LBB91_2 -; GFX11-FAKE16-NEXT: .LBB91_4: -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s94 :: v_dual_mov_b32 v11, s30 -; GFX11-FAKE16-NEXT: v_readlane_b32 s94, v43, 2 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v96, s37 :: v_dual_mov_b32 v87, s34 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s49 :: v_dual_mov_b32 v7, s35 -; GFX11-FAKE16-NEXT: v_readlane_b32 s95, v43, 3 -; GFX11-FAKE16-NEXT: v_readlane_b32 vcc_lo, v43, 6 -; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v43, 0 -; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v43, 4 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s44 :: v_dual_mov_b32 v51, s45 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, s10 :: v_dual_mov_b32 v49, s46 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v39, s47 :: v_dual_mov_b32 v48, s98 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s56 :: v_dual_mov_b32 v37, s97 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, s57 :: v_dual_mov_b32 v35, s58 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s59 :: v_dual_mov_b32 v33, s9 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s60 :: v_dual_mov_b32 v31, s61 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s8 :: v_dual_mov_b32 v29, s62 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, s63 :: v_dual_mov_b32 v28, s96 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s72 :: v_dual_mov_b32 v25, s7 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s73 :: v_dual_mov_b32 v23, s28 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v21, s29 :: v_dual_mov_b32 v22, s6 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v53, s87 :: v_dual_mov_b32 v54, s86 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s85 :: v_dual_mov_b32 v12, s5 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v65, s4 :: v_dual_mov_b32 v66, s48 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v55, s81 :: v_dual_mov_b32 v64, s84 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v69, s83 :: v_dual_mov_b32 v70, s82 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v67, s70 :: v_dual_mov_b32 v68, s80 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v80, s71 :: v_dual_mov_b32 v19, s39 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v71, s66 :: v_dual_mov_b32 v20, s69 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v82, s68 :: v_dual_mov_b32 v17, s67 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v81, s55 :: v_dual_mov_b32 v18, s65 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v84, s38 :: v_dual_mov_b32 v15, s64 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v83, s51 :: v_dual_mov_b32 v16, s54 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v86, s53 :: v_dual_mov_b32 v13, s52 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v85, s36 :: v_dual_mov_b32 v14, s50 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s74 :: v_dual_mov_b32 v2, s76 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s78 :: v_dual_mov_b32 v4, s88 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s90 :: v_dual_mov_b32 v9, s92 -; GFX11-FAKE16-NEXT: s_mov_b32 s58, s11 -; GFX11-FAKE16-NEXT: v_readlane_b32 s59, v43, 8 -; GFX11-FAKE16-NEXT: v_readlane_b32 s72, v43, 9 -; GFX11-FAKE16-NEXT: v_readlane_b32 s60, v43, 10 -; GFX11-FAKE16-NEXT: v_readlane_b32 s61, v43, 11 -; GFX11-FAKE16-NEXT: v_readlane_b32 s62, v43, 12 -; GFX11-FAKE16-NEXT: v_readlane_b32 s63, v43, 13 -; GFX11-FAKE16-NEXT: v_readlane_b32 s73, v43, 14 -; GFX11-FAKE16-NEXT: v_readlane_b32 s13, v43, 15 -; GFX11-FAKE16-NEXT: v_readlane_b32 s15, v43, 16 -; GFX11-FAKE16-NEXT: v_readlane_b32 s41, v43, 17 -; GFX11-FAKE16-NEXT: v_readlane_b32 s43, v43, 18 -; GFX11-FAKE16-NEXT: v_readlane_b32 s56, v43, 19 -; GFX11-FAKE16-NEXT: v_readlane_b32 s11, v43, 20 -; GFX11-FAKE16-NEXT: v_readlane_b32 s57, v43, 21 -; GFX11-FAKE16-NEXT: v_readlane_b32 s10, v43, 22 -; GFX11-FAKE16-NEXT: v_readlane_b32 s74, v43, 23 -; GFX11-FAKE16-NEXT: v_readlane_b32 s9, v43, 24 -; GFX11-FAKE16-NEXT: v_readlane_b32 s75, v43, 25 -; GFX11-FAKE16-NEXT: v_readlane_b32 s8, v43, 26 -; GFX11-FAKE16-NEXT: v_readlane_b32 s76, v43, 27 -; GFX11-FAKE16-NEXT: v_readlane_b32 s77, v43, 28 -; GFX11-FAKE16-NEXT: v_readlane_b32 s78, v43, 29 -; GFX11-FAKE16-NEXT: v_readlane_b32 s79, v43, 30 -; GFX11-FAKE16-NEXT: v_readlane_b32 s88, v43, 31 -; GFX11-FAKE16-NEXT: v_readlane_b32 s89, v42, 0 -; GFX11-FAKE16-NEXT: v_readlane_b32 s90, v42, 1 -; GFX11-FAKE16-NEXT: v_readlane_b32 s91, v42, 2 -; GFX11-FAKE16-NEXT: v_readlane_b32 s92, v42, 3 -; GFX11-FAKE16-NEXT: v_readlane_b32 s47, v42, 4 -; GFX11-FAKE16-NEXT: v_readlane_b32 s93, v42, 5 -; GFX11-FAKE16-NEXT: v_readlane_b32 vcc_hi, v43, 7 -; GFX11-FAKE16-NEXT: v_readlane_b32 s46, v42, 6 -; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v43, 1 -; GFX11-FAKE16-NEXT: v_readlane_b32 s95, v42, 7 -; GFX11-FAKE16-NEXT: v_readlane_b32 s45, v42, 8 -; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v43, 5 -; GFX11-FAKE16-NEXT: .LBB91_5: ; %end -; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s104, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s5, s103, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s42, 8 -; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s4 -; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s6 -; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s102, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s6, s58, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s101, 8 -; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s5 -; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s7 -; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16 -; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff -; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 -; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s4 -; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s5 -; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s100, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s5, s99, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s40, 8 -; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s4 -; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s6 -; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s45, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s6, s59, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s95, 8 -; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s5 -; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s7 -; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16 -; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xffff -; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 -; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s4 -; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s5 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v97, s0 :: v_dual_mov_b32 v98, s1 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v99, s2 :: v_dual_mov_b32 v100, s3 -; GFX11-FAKE16-NEXT: s_and_b32 s0, s16, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s46, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s2, s93, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s14, 8 -; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 -; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 -; GFX11-FAKE16-NEXT: s_and_b32 s2, s17, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s47, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s4, s72, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s92, 8 -; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 -; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5 -; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff -; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 -; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 -; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s91, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s4, s90, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s12, 8 -; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 -; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5 -; GFX11-FAKE16-NEXT: s_and_b32 s4, s19, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s89, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s6, s60, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s88, 8 -; GFX11-FAKE16-NEXT: s_or_b32 s4, s4, s5 -; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s7 -; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-FAKE16-NEXT: s_and_b32 s4, s4, 0xffff -; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 -; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 -; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v112, s0 :: v_dual_mov_b32 v113, s1 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v114, s2 :: v_dual_mov_b32 v115, s3 -; GFX11-FAKE16-NEXT: s_and_b32 s0, s20, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s79, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s2, s78, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s30, 8 -; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 -; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 -; GFX11-FAKE16-NEXT: s_and_b32 s2, s21, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s77, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s4, s61, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s76, 8 -; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 -; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5 -; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff -; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 -; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 -; GFX11-FAKE16-NEXT: s_and_b32 s2, s22, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s8, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s4, s75, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s94, 8 -; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 -; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5 -; GFX11-FAKE16-NEXT: s_and_b32 s4, s23, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s9, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s6, s62, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s74, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-FAKE16-NEXT: s_or_b32 s4, s4, s5 -; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s7 -; GFX11-FAKE16-NEXT: s_and_b32 s4, s4, 0xffff -; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 -; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[97:100], off -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[112:115], off offset:16 -; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v97, s0 :: v_dual_mov_b32 v98, s1 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v99, s2 :: v_dual_mov_b32 v100, s3 -; GFX11-FAKE16-NEXT: s_and_b32 s0, s24, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s10, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s2, s57, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s34, 8 -; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 -; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s4 -; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff -; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s11, 8 -; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 -; GFX11-FAKE16-NEXT: s_and_b32 s1, s25, 0xff -; GFX11-FAKE16-NEXT: s_and_b32 s3, s63, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s56, 8 -; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 -; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s4 -; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff -; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 -; GFX11-FAKE16-NEXT: s_and_b32 s3, s26, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s43, 8 -; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 -; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s4 -; GFX11-FAKE16-NEXT: s_and_b32 s3, s41, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, vcc_lo, 8 -; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s15, 8 -; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4 -; GFX11-FAKE16-NEXT: s_and_b32 s4, s27, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s13, 8 -; GFX11-FAKE16-NEXT: s_or_b32 s4, s4, s5 -; GFX11-FAKE16-NEXT: s_and_b32 s5, s73, 0xff -; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 -; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-FAKE16-NEXT: s_and_b32 s4, s4, 0xffff -; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v112, s0 :: v_dual_and_b32 v23, 0xff, v23 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v113, s1 :: v_dual_lshlrev_b32 v6, 8, v6 -; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v114, s2 :: v_dual_lshlrev_b32 v11, 8, v11 -; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v115, s3 :: v_dual_and_b32 v96, 0xff, v96 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v23, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v96, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 8, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v6, v11 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v21 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v87 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v26 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v86 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v11, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v22, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v26, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v24, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v25 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v85 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v84 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v27 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v83 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v22, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v24, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v25, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v26, v27 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v6, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v11, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v13, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v15, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v16, v21 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v32 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 8, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v82 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 8, v18 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v81 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v35 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v13, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v15, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v17, v18 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v80 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v34 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v20 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v33 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v71 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v38 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v70 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v69 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v14, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v15, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v17, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v19, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v21, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v9, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v10, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v36 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 8, v68 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v37 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v67 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v49 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v66 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v65 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v39 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v64 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v10, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v16, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v18, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v19, v20 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v48 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v52 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v54 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v53 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v51 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 8, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v50 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v18, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v20, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v21, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v22, v5 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v16 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v3, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v10, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v11, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v17, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v12, v5 -; GFX11-FAKE16-NEXT: s_clause 0x5 -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[97:100], off offset:32 -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[112:115], off offset:48 -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[23:26], off offset:64 -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[6:9], off offset:80 -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:96 -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 -; GFX11-FAKE16-NEXT: v_readlane_b32 s104, v41, 8 -; GFX11-FAKE16-NEXT: v_readlane_b32 s103, v41, 7 -; GFX11-FAKE16-NEXT: v_readlane_b32 s102, v41, 6 -; GFX11-FAKE16-NEXT: v_readlane_b32 s101, v41, 5 -; GFX11-FAKE16-NEXT: v_readlane_b32 s100, v41, 4 -; GFX11-FAKE16-NEXT: v_readlane_b32 s99, v41, 3 -; GFX11-FAKE16-NEXT: v_readlane_b32 s98, v41, 2 -; GFX11-FAKE16-NEXT: v_readlane_b32 s97, v41, 1 -; GFX11-FAKE16-NEXT: v_readlane_b32 s96, v41, 0 -; GFX11-FAKE16-NEXT: v_readlane_b32 s87, v40, 31 -; GFX11-FAKE16-NEXT: v_readlane_b32 s86, v40, 30 -; GFX11-FAKE16-NEXT: v_readlane_b32 s85, v40, 29 -; GFX11-FAKE16-NEXT: v_readlane_b32 s84, v40, 28 -; GFX11-FAKE16-NEXT: v_readlane_b32 s83, v40, 27 -; GFX11-FAKE16-NEXT: v_readlane_b32 s82, v40, 26 -; GFX11-FAKE16-NEXT: v_readlane_b32 s81, v40, 25 -; GFX11-FAKE16-NEXT: v_readlane_b32 s80, v40, 24 -; GFX11-FAKE16-NEXT: v_readlane_b32 s71, v40, 23 -; GFX11-FAKE16-NEXT: v_readlane_b32 s70, v40, 22 -; GFX11-FAKE16-NEXT: v_readlane_b32 s69, v40, 21 -; GFX11-FAKE16-NEXT: v_readlane_b32 s68, v40, 20 -; GFX11-FAKE16-NEXT: v_readlane_b32 s67, v40, 19 -; GFX11-FAKE16-NEXT: v_readlane_b32 s66, v40, 18 -; GFX11-FAKE16-NEXT: v_readlane_b32 s65, v40, 17 -; GFX11-FAKE16-NEXT: v_readlane_b32 s64, v40, 16 -; GFX11-FAKE16-NEXT: v_readlane_b32 s55, v40, 15 -; GFX11-FAKE16-NEXT: v_readlane_b32 s54, v40, 14 -; GFX11-FAKE16-NEXT: v_readlane_b32 s53, v40, 13 -; GFX11-FAKE16-NEXT: v_readlane_b32 s52, v40, 12 -; GFX11-FAKE16-NEXT: v_readlane_b32 s51, v40, 11 -; GFX11-FAKE16-NEXT: v_readlane_b32 s50, v40, 10 -; GFX11-FAKE16-NEXT: v_readlane_b32 s49, v40, 9 -; GFX11-FAKE16-NEXT: v_readlane_b32 s48, v40, 8 -; GFX11-FAKE16-NEXT: v_readlane_b32 s39, v40, 7 -; GFX11-FAKE16-NEXT: v_readlane_b32 s38, v40, 6 -; GFX11-FAKE16-NEXT: v_readlane_b32 s37, v40, 5 -; GFX11-FAKE16-NEXT: v_readlane_b32 s36, v40, 4 -; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v40, 3 -; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v40, 2 -; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-FAKE16-NEXT: s_clause 0x3 ; 16-byte Folded Reload -; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:12 -; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: bitcast_v64bf16_to_v128i8_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-NEXT: s_clause 0x2 ; 12-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v18, s32 +; GFX11-NEXT: scratch_store_b32 off, v19, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v20, s32 offset:8 +; GFX11-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-NEXT: v_writelane_b32 v18, s30, 0 +; GFX11-NEXT: v_writelane_b32 v19, s96, 0 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 +; GFX11-NEXT: v_writelane_b32 v18, s31, 1 +; GFX11-NEXT: v_writelane_b32 v19, s97, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_readfirstlane_b32 s40, v16 +; GFX11-NEXT: v_readfirstlane_b32 s41, v17 +; GFX11-NEXT: v_readfirstlane_b32 s28, v1 +; GFX11-NEXT: v_writelane_b32 v18, s34, 2 +; GFX11-NEXT: v_writelane_b32 v19, s98, 2 +; GFX11-NEXT: v_readfirstlane_b32 s29, v2 +; GFX11-NEXT: v_readfirstlane_b32 s14, v3 +; GFX11-NEXT: v_readfirstlane_b32 s15, v4 +; GFX11-NEXT: v_writelane_b32 v18, s35, 3 +; GFX11-NEXT: v_writelane_b32 v19, s99, 3 +; GFX11-NEXT: v_readfirstlane_b32 s12, v5 +; GFX11-NEXT: v_readfirstlane_b32 s13, v6 +; GFX11-NEXT: v_readfirstlane_b32 s10, v7 +; GFX11-NEXT: v_writelane_b32 v18, s36, 4 +; GFX11-NEXT: v_writelane_b32 v19, s100, 4 +; GFX11-NEXT: v_readfirstlane_b32 s11, v8 +; GFX11-NEXT: v_readfirstlane_b32 s8, v9 +; GFX11-NEXT: v_readfirstlane_b32 s9, v10 +; GFX11-NEXT: v_writelane_b32 v18, s37, 5 +; GFX11-NEXT: v_writelane_b32 v19, s101, 5 +; GFX11-NEXT: v_readfirstlane_b32 s6, v11 +; GFX11-NEXT: v_readfirstlane_b32 s7, v12 +; GFX11-NEXT: v_readfirstlane_b32 s4, v13 +; GFX11-NEXT: v_writelane_b32 v18, s38, 6 +; GFX11-NEXT: v_writelane_b32 v19, s102, 6 +; GFX11-NEXT: v_readfirstlane_b32 s5, v14 +; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr20 : SGPR spill to VGPR lane +; GFX11-NEXT: v_writelane_b32 v18, s39, 7 +; GFX11-NEXT: v_writelane_b32 v19, s103, 7 +; GFX11-NEXT: v_writelane_b32 v18, s48, 8 +; GFX11-NEXT: v_writelane_b32 v19, s104, 8 +; GFX11-NEXT: s_mov_b32 s104, 0 +; GFX11-NEXT: v_writelane_b32 v18, s49, 9 +; GFX11-NEXT: v_writelane_b32 v18, s50, 10 +; GFX11-NEXT: v_writelane_b32 v18, s51, 11 +; GFX11-NEXT: v_writelane_b32 v18, s52, 12 +; GFX11-NEXT: v_writelane_b32 v18, s53, 13 +; GFX11-NEXT: v_writelane_b32 v18, s54, 14 +; GFX11-NEXT: v_writelane_b32 v18, s55, 15 +; GFX11-NEXT: v_writelane_b32 v18, s64, 16 +; GFX11-NEXT: v_writelane_b32 v18, s65, 17 +; GFX11-NEXT: v_writelane_b32 v18, s66, 18 +; GFX11-NEXT: v_writelane_b32 v18, s67, 19 +; GFX11-NEXT: v_writelane_b32 v18, s68, 20 +; GFX11-NEXT: v_writelane_b32 v18, s69, 21 +; GFX11-NEXT: v_writelane_b32 v18, s70, 22 +; GFX11-NEXT: v_writelane_b32 v18, s71, 23 +; GFX11-NEXT: v_writelane_b32 v18, s80, 24 +; GFX11-NEXT: v_writelane_b32 v18, s81, 25 +; GFX11-NEXT: v_writelane_b32 v18, s82, 26 +; GFX11-NEXT: v_writelane_b32 v18, s83, 27 +; GFX11-NEXT: v_writelane_b32 v18, s84, 28 +; GFX11-NEXT: v_writelane_b32 v18, s85, 29 +; GFX11-NEXT: v_writelane_b32 v18, s86, 30 +; GFX11-NEXT: v_writelane_b32 v18, s87, 31 +; GFX11-NEXT: s_cbranch_scc0 .LBB91_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_lshr_b32 s42, s25, 16 +; GFX11-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-NEXT: v_writelane_b32 v20, s42, 20 +; GFX11-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-NEXT: s_lshr_b32 s46, s41, 24 +; GFX11-NEXT: s_lshr_b64 s[90:91], s[26:27], 24 +; GFX11-NEXT: s_mov_b32 s91, s46 +; GFX11-NEXT: v_writelane_b32 v20, s42, 19 +; GFX11-NEXT: s_lshr_b32 s42, s21, 16 +; GFX11-NEXT: s_lshr_b64 s[46:47], s[24:25], 24 +; GFX11-NEXT: s_lshr_b32 s72, s7, 24 +; GFX11-NEXT: s_lshr_b32 s73, s7, 8 +; GFX11-NEXT: v_writelane_b32 v20, s42, 18 +; GFX11-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-NEXT: s_lshr_b32 s77, s27, 24 +; GFX11-NEXT: s_lshr_b32 s76, s6, 16 +; GFX11-NEXT: s_lshr_b64 s[78:79], s[16:17], 24 +; GFX11-NEXT: v_writelane_b32 v20, s42, 17 +; GFX11-NEXT: s_lshr_b32 s42, s17, 16 +; GFX11-NEXT: s_lshr_b64 s[92:93], s[2:3], 24 +; GFX11-NEXT: s_lshr_b32 s88, s27, 8 +; GFX11-NEXT: s_mov_b32 s79, s72 +; GFX11-NEXT: v_writelane_b32 v20, s42, 16 +; GFX11-NEXT: s_lshr_b32 s42, s3, 24 +; GFX11-NEXT: s_mov_b32 s93, s73 +; GFX11-NEXT: s_lshr_b64 s[94:95], s[0:1], 24 +; GFX11-NEXT: s_lshr_b64 s[72:73], s[12:13], 24 +; GFX11-NEXT: v_writelane_b32 v20, s42, 21 +; GFX11-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-NEXT: s_lshr_b32 vcc_lo, s26, 16 +; GFX11-NEXT: s_lshr_b32 s56, s4, 16 +; GFX11-NEXT: s_lshr_b32 s57, s4, 8 +; GFX11-NEXT: v_writelane_b32 v20, s42, 15 +; GFX11-NEXT: s_lshr_b32 s42, s3, 8 +; GFX11-NEXT: s_mov_b32 s95, s76 +; GFX11-NEXT: s_mov_b32 s73, s77 +; GFX11-NEXT: s_lshr_b64 s[76:77], s[14:15], 24 +; GFX11-NEXT: v_writelane_b32 v20, s42, 22 +; GFX11-NEXT: s_lshr_b32 s42, s2, 16 +; GFX11-NEXT: s_lshr_b64 s[62:63], s[20:21], 24 +; GFX11-NEXT: s_lshr_b64 s[74:75], s[18:19], 24 +; GFX11-NEXT: s_mov_b32 s77, s88 +; GFX11-NEXT: v_writelane_b32 v20, s42, 23 +; GFX11-NEXT: s_lshr_b32 s42, s2, 8 +; GFX11-NEXT: s_lshr_b64 s[88:89], s[28:29], 24 +; GFX11-NEXT: s_lshr_b32 s71, s27, 16 +; GFX11-NEXT: s_lshr_b32 s83, s26, 8 +; GFX11-NEXT: v_writelane_b32 v20, s42, 24 +; GFX11-NEXT: s_lshr_b32 s42, s1, 24 +; GFX11-NEXT: s_lshr_b32 s66, s25, 24 +; GFX11-NEXT: s_lshr_b32 s67, s25, 8 +; GFX11-NEXT: s_lshr_b32 s68, s24, 16 +; GFX11-NEXT: v_writelane_b32 v20, s42, 25 +; GFX11-NEXT: s_lshr_b32 s42, s1, 16 +; GFX11-NEXT: s_lshr_b32 s49, s24, 8 +; GFX11-NEXT: s_lshr_b32 s69, s23, 24 +; GFX11-NEXT: s_lshr_b32 s70, s23, 8 +; GFX11-NEXT: v_writelane_b32 v20, s42, 14 +; GFX11-NEXT: s_lshr_b32 s42, s1, 8 +; GFX11-NEXT: s_lshr_b32 s64, s22, 16 +; GFX11-NEXT: s_lshr_b32 s80, s22, 8 +; GFX11-NEXT: s_lshr_b32 s58, s21, 24 +; GFX11-NEXT: v_writelane_b32 v20, s42, 26 +; GFX11-NEXT: s_lshr_b32 s42, s5, 24 +; GFX11-NEXT: s_lshr_b32 s59, s21, 8 +; GFX11-NEXT: s_lshr_b32 s50, s20, 16 +; GFX11-NEXT: s_lshr_b32 s81, s20, 8 +; GFX11-NEXT: v_writelane_b32 v20, s42, 27 +; GFX11-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-NEXT: s_lshr_b32 s85, s19, 24 +; GFX11-NEXT: s_lshr_b32 s60, s19, 8 +; GFX11-NEXT: s_lshr_b32 s61, s18, 16 +; GFX11-NEXT: v_writelane_b32 v20, s42, 13 +; GFX11-NEXT: s_lshr_b32 s42, s5, 8 +; GFX11-NEXT: s_lshr_b32 s96, s18, 8 +; GFX11-NEXT: s_lshr_b32 s98, s17, 24 +; GFX11-NEXT: s_lshr_b32 s99, s17, 8 +; GFX11-NEXT: v_writelane_b32 v20, s42, 28 +; GFX11-NEXT: s_lshr_b32 s42, s7, 16 +; GFX11-NEXT: s_lshr_b32 s53, s16, 16 +; GFX11-NEXT: s_lshr_b32 s43, s16, 8 +; GFX11-NEXT: s_lshr_b32 s102, s0, 16 +; GFX11-NEXT: v_writelane_b32 v20, s42, 12 +; GFX11-NEXT: s_lshr_b32 s42, s9, 16 +; GFX11-NEXT: s_lshr_b32 s103, s0, 8 +; GFX11-NEXT: s_lshr_b32 s34, s6, 8 +; GFX11-NEXT: s_lshr_b32 s65, s9, 24 +; GFX11-NEXT: v_writelane_b32 v20, s42, 11 +; GFX11-NEXT: s_lshr_b32 s42, s11, 16 +; GFX11-NEXT: s_lshr_b32 s55, s9, 8 +; GFX11-NEXT: s_lshr_b32 s35, s8, 16 +; GFX11-NEXT: s_lshr_b32 s36, s8, 8 +; GFX11-NEXT: v_writelane_b32 v20, s42, 10 +; GFX11-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-NEXT: s_lshr_b32 s37, s11, 24 +; GFX11-NEXT: s_lshr_b32 s38, s11, 8 +; GFX11-NEXT: s_lshr_b32 s39, s10, 16 +; GFX11-NEXT: v_writelane_b32 v20, s42, 9 +; GFX11-NEXT: s_lshr_b32 s42, s15, 16 +; GFX11-NEXT: s_lshr_b32 s48, s10, 8 +; GFX11-NEXT: s_lshr_b32 s84, s13, 24 +; GFX11-NEXT: s_lshr_b32 s82, s13, 8 +; GFX11-NEXT: v_writelane_b32 v20, s42, 8 +; GFX11-NEXT: s_lshr_b32 s86, s12, 16 +; GFX11-NEXT: s_lshr_b32 s51, s12, 8 +; GFX11-NEXT: s_lshr_b32 s97, s15, 24 +; GFX11-NEXT: s_lshr_b32 s87, s15, 8 +; GFX11-NEXT: v_writelane_b32 v20, s44, 29 +; GFX11-NEXT: s_lshr_b32 s44, s28, 8 +; GFX11-NEXT: s_lshr_b32 s52, s14, 16 +; GFX11-NEXT: s_lshr_b32 s100, s14, 8 +; GFX11-NEXT: s_lshr_b32 s42, s29, 24 +; GFX11-NEXT: v_writelane_b32 v20, s44, 30 +; GFX11-NEXT: s_lshr_b32 s44, s41, 8 +; GFX11-NEXT: s_lshr_b32 vcc_hi, s29, 16 +; GFX11-NEXT: s_lshr_b32 s101, s29, 8 +; GFX11-NEXT: s_lshr_b32 s54, s41, 16 +; GFX11-NEXT: v_writelane_b32 v20, s44, 31 +; GFX11-NEXT: s_lshr_b32 s44, s40, 16 +; GFX11-NEXT: s_lshr_b32 s45, s40, 8 +; GFX11-NEXT: s_mov_b32 s63, s56 +; GFX11-NEXT: s_mov_b32 s75, s57 +; GFX11-NEXT: v_writelane_b32 v20, s46, 2 +; GFX11-NEXT: s_lshr_b64 s[56:57], s[10:11], 24 +; GFX11-NEXT: s_mov_b32 s89, vcc_lo +; GFX11-NEXT: s_lshr_b64 s[30:31], s[40:41], 24 +; GFX11-NEXT: v_writelane_b32 v20, s47, 3 +; GFX11-NEXT: s_lshr_b64 s[46:47], s[22:23], 24 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v20, s46, 0 +; GFX11-NEXT: v_writelane_b32 v20, s47, 1 +; GFX11-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 +; GFX11-NEXT: v_writelane_b32 v20, s46, 6 +; GFX11-NEXT: v_writelane_b32 v20, s47, 7 +; GFX11-NEXT: s_lshr_b64 s[46:47], s[6:7], 24 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v20, s46, 4 +; GFX11-NEXT: v_writelane_b32 v20, s47, 5 +; GFX11-NEXT: s_lshr_b64 s[46:47], s[8:9], 24 +; GFX11-NEXT: s_branch .LBB91_3 +; GFX11-NEXT: .LBB91_2: +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr43 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $vcc_hi +; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: s_mov_b32 s104, -1 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr103 +; GFX11-NEXT: ; implicit-def: $sgpr102 +; GFX11-NEXT: ; implicit-def: $sgpr94 +; GFX11-NEXT: ; implicit-def: $sgpr92 +; GFX11-NEXT: ; implicit-def: $sgpr53 +; GFX11-NEXT: ; implicit-def: $sgpr78 +; GFX11-NEXT: ; implicit-def: $sgpr99 +; GFX11-NEXT: ; implicit-def: $sgpr98 +; GFX11-NEXT: ; implicit-def: $sgpr96 +; GFX11-NEXT: ; implicit-def: $sgpr61 +; GFX11-NEXT: ; implicit-def: $sgpr74 +; GFX11-NEXT: ; implicit-def: $sgpr60 +; GFX11-NEXT: ; implicit-def: $sgpr85 +; GFX11-NEXT: ; implicit-def: $sgpr81 +; GFX11-NEXT: ; implicit-def: $sgpr50 +; GFX11-NEXT: ; implicit-def: $sgpr62 +; GFX11-NEXT: ; implicit-def: $sgpr59 +; GFX11-NEXT: ; implicit-def: $sgpr58 +; GFX11-NEXT: ; implicit-def: $sgpr80 +; GFX11-NEXT: ; implicit-def: $sgpr64 +; GFX11-NEXT: ; implicit-def: $sgpr70 +; GFX11-NEXT: ; implicit-def: $sgpr69 +; GFX11-NEXT: ; implicit-def: $sgpr49 +; GFX11-NEXT: ; implicit-def: $sgpr68 +; GFX11-NEXT: ; implicit-def: $sgpr67 +; GFX11-NEXT: ; implicit-def: $sgpr66 +; GFX11-NEXT: ; implicit-def: $sgpr83 +; GFX11-NEXT: ; implicit-def: $sgpr89 +; GFX11-NEXT: ; implicit-def: $sgpr77 +; GFX11-NEXT: ; implicit-def: $sgpr71 +; GFX11-NEXT: ; implicit-def: $sgpr73 +; GFX11-NEXT: ; implicit-def: $sgpr45 +; GFX11-NEXT: ; implicit-def: $sgpr44 +; GFX11-NEXT: ; implicit-def: $sgpr54 +; GFX11-NEXT: ; implicit-def: $sgpr91 +; GFX11-NEXT: ; implicit-def: $sgpr101 +; GFX11-NEXT: ; implicit-def: $sgpr100 +; GFX11-NEXT: ; implicit-def: $sgpr52 +; GFX11-NEXT: ; implicit-def: $sgpr87 +; GFX11-NEXT: ; implicit-def: $sgpr97 +; GFX11-NEXT: ; implicit-def: $sgpr51 +; GFX11-NEXT: ; implicit-def: $sgpr86 +; GFX11-NEXT: ; implicit-def: $sgpr82 +; GFX11-NEXT: ; implicit-def: $sgpr84 +; GFX11-NEXT: ; implicit-def: $sgpr48 +; GFX11-NEXT: ; implicit-def: $sgpr39 +; GFX11-NEXT: ; implicit-def: $sgpr38 +; GFX11-NEXT: ; implicit-def: $sgpr37 +; GFX11-NEXT: ; implicit-def: $sgpr36 +; GFX11-NEXT: ; implicit-def: $sgpr35 +; GFX11-NEXT: ; implicit-def: $sgpr55 +; GFX11-NEXT: ; implicit-def: $sgpr65 +; GFX11-NEXT: ; implicit-def: $sgpr34 +; GFX11-NEXT: ; implicit-def: $sgpr95 +; GFX11-NEXT: ; implicit-def: $sgpr93 +; GFX11-NEXT: ; implicit-def: $sgpr79 +; GFX11-NEXT: ; implicit-def: $sgpr75 +; GFX11-NEXT: ; implicit-def: $sgpr63 +; GFX11-NEXT: ; implicit-def: $sgpr90 +; GFX11-NEXT: ; implicit-def: $sgpr30 +; GFX11-NEXT: ; implicit-def: $sgpr88 +; GFX11-NEXT: ; implicit-def: $sgpr76 +; GFX11-NEXT: ; implicit-def: $sgpr72 +; GFX11-NEXT: ; implicit-def: $sgpr56 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: v_writelane_b32 v20, s42, 0 +; GFX11-NEXT: v_writelane_b32 v20, s43, 1 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: v_writelane_b32 v20, s46, 2 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: v_writelane_b32 v20, s47, 3 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: v_writelane_b32 v20, vcc_lo, 4 +; GFX11-NEXT: v_writelane_b32 v20, vcc_hi, 5 +; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: v_writelane_b32 v20, vcc_lo, 6 +; GFX11-NEXT: v_writelane_b32 v20, vcc_hi, 7 +; GFX11-NEXT: .LBB91_3: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s104 +; GFX11-NEXT: s_mov_b32 s104, s54 +; GFX11-NEXT: s_mov_b32 s54, vcc_hi +; GFX11-NEXT: s_mov_b32 vcc_hi, s34 +; GFX11-NEXT: s_mov_b32 s34, s65 +; GFX11-NEXT: s_mov_b32 s65, s84 +; GFX11-NEXT: s_mov_b32 s84, s86 +; GFX11-NEXT: s_mov_b32 s86, s97 +; GFX11-NEXT: s_mov_b32 s97, s43 +; GFX11-NEXT: s_cbranch_vccnz .LBB91_5 +; GFX11-NEXT: ; %bb.4: ; %cmp.true +; GFX11-NEXT: s_and_b32 s42, s41, 0xffff0000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s42 +; GFX11-NEXT: v_readfirstlane_b32 s42, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s43, s42, 0x10010 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s43, s43, s42 +; GFX11-NEXT: s_bitset1_b32 s42, 22 +; GFX11-NEXT: s_addk_i32 s43, 0x7fff +; GFX11-NEXT: s_and_b32 s44, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s42, s42, s43 +; GFX11-NEXT: s_lshl_b32 s41, s41, 16 +; GFX11-NEXT: s_lshr_b32 s104, s42, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s41 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s41, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s43, s41, 0x10010 +; GFX11-NEXT: s_add_i32 s42, s43, s41 +; GFX11-NEXT: s_bitset1_b32 s41, 22 +; GFX11-NEXT: s_addk_i32 s42, 0x7fff +; GFX11-NEXT: s_and_b32 s43, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s41, s41, s42 +; GFX11-NEXT: s_and_b32 s42, s40, 0xffff0000 +; GFX11-NEXT: s_lshr_b32 s41, s41, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s42 +; GFX11-NEXT: s_pack_ll_b32_b16 s31, s41, s104 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s42, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s43, s42, 0x10010 +; GFX11-NEXT: s_add_i32 s43, s43, s42 +; GFX11-NEXT: s_bitset1_b32 s42, 22 +; GFX11-NEXT: s_addk_i32 s43, 0x7fff +; GFX11-NEXT: s_and_b32 s44, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s42, s42, s43 +; GFX11-NEXT: s_lshl_b32 s40, s40, 16 +; GFX11-NEXT: s_lshr_b32 s42, s42, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s40 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s40, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s43, s40, 0x10010 +; GFX11-NEXT: s_add_i32 s43, s43, s40 +; GFX11-NEXT: s_bitset1_b32 s40, 22 +; GFX11-NEXT: s_addk_i32 s43, 0x7fff +; GFX11-NEXT: s_and_b32 s44, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s40, s40, s43 +; GFX11-NEXT: s_and_b32 s43, s29, 0xffff0000 +; GFX11-NEXT: s_lshr_b32 s40, s40, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s43 +; GFX11-NEXT: s_pack_ll_b32_b16 s30, s40, s42 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s43, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s44, s43, 0x10010 +; GFX11-NEXT: s_add_i32 s44, s44, s43 +; GFX11-NEXT: s_bitset1_b32 s43, 22 +; GFX11-NEXT: s_addk_i32 s44, 0x7fff +; GFX11-NEXT: s_and_b32 s45, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s43, s43, s44 +; GFX11-NEXT: s_lshl_b32 s29, s29, 16 +; GFX11-NEXT: s_lshr_b32 s54, s43, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s29, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s44, s29, 0x10010 +; GFX11-NEXT: s_add_i32 s43, s44, s29 +; GFX11-NEXT: s_bitset1_b32 s29, 22 +; GFX11-NEXT: s_addk_i32 s43, 0x7fff +; GFX11-NEXT: s_and_b32 s44, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s29, s29, s43 +; GFX11-NEXT: s_and_b32 s43, s28, 0xffff0000 +; GFX11-NEXT: s_lshr_b32 s29, s29, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s43 +; GFX11-NEXT: s_pack_ll_b32_b16 s89, s29, s54 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s43, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s44, s43, 0x10010 +; GFX11-NEXT: s_add_i32 s44, s44, s43 +; GFX11-NEXT: s_bitset1_b32 s43, 22 +; GFX11-NEXT: s_addk_i32 s44, 0x7fff +; GFX11-NEXT: s_and_b32 s45, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s43, s43, s44 +; GFX11-NEXT: s_lshl_b32 s28, s28, 16 +; GFX11-NEXT: s_lshr_b32 s43, s43, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s28 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s28, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s44, s28, 0x10010 +; GFX11-NEXT: s_add_i32 s44, s44, s28 +; GFX11-NEXT: s_bitset1_b32 s28, 22 +; GFX11-NEXT: s_addk_i32 s44, 0x7fff +; GFX11-NEXT: s_and_b32 s45, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s28, s28, s44 +; GFX11-NEXT: s_and_b32 s44, s15, 0xffff0000 +; GFX11-NEXT: s_lshr_b32 s28, s28, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s44 +; GFX11-NEXT: s_pack_ll_b32_b16 s88, s28, s43 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s44, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s45, s44, 0x10010 +; GFX11-NEXT: s_add_i32 s45, s45, s44 +; GFX11-NEXT: s_bitset1_b32 s44, 22 +; GFX11-NEXT: s_addk_i32 s45, 0x7fff +; GFX11-NEXT: s_and_b32 s46, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s44, s44, s45 +; GFX11-NEXT: s_lshl_b32 s15, s15, 16 +; GFX11-NEXT: s_lshr_b32 s72, s44, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s15 +; GFX11-NEXT: v_writelane_b32 v20, s72, 8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s15, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s45, s15, 0x10010 +; GFX11-NEXT: s_add_i32 s44, s45, s15 +; GFX11-NEXT: s_bitset1_b32 s15, 22 +; GFX11-NEXT: s_addk_i32 s44, 0x7fff +; GFX11-NEXT: s_and_b32 s45, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s15, s15, s44 +; GFX11-NEXT: s_and_b32 s44, s14, 0xffff0000 +; GFX11-NEXT: s_lshr_b32 s15, s15, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s44 +; GFX11-NEXT: s_pack_ll_b32_b16 s77, s15, s72 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s44, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s45, s44, 0x10010 +; GFX11-NEXT: s_add_i32 s45, s45, s44 +; GFX11-NEXT: s_bitset1_b32 s44, 22 +; GFX11-NEXT: s_addk_i32 s45, 0x7fff +; GFX11-NEXT: s_and_b32 s46, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s44, s44, s45 +; GFX11-NEXT: s_lshl_b32 s14, s14, 16 +; GFX11-NEXT: s_lshr_b32 s44, s44, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s14, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s45, s14, 0x10010 +; GFX11-NEXT: s_add_i32 s45, s45, s14 +; GFX11-NEXT: s_bitset1_b32 s14, 22 +; GFX11-NEXT: s_addk_i32 s45, 0x7fff +; GFX11-NEXT: s_and_b32 s46, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s14, s14, s45 +; GFX11-NEXT: s_and_b32 s45, s13, 0xffff0000 +; GFX11-NEXT: s_lshr_b32 s14, s14, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s45 +; GFX11-NEXT: s_pack_ll_b32_b16 s76, s14, s44 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s45, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s46, s45, 0x10010 +; GFX11-NEXT: s_add_i32 s46, s46, s45 +; GFX11-NEXT: s_bitset1_b32 s45, 22 +; GFX11-NEXT: s_addk_i32 s46, 0x7fff +; GFX11-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s45, s45, s46 +; GFX11-NEXT: s_lshl_b32 s13, s13, 16 +; GFX11-NEXT: s_lshr_b32 s73, s45, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s13 +; GFX11-NEXT: v_writelane_b32 v20, s73, 9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s13, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s46, s13, 0x10010 +; GFX11-NEXT: s_add_i32 s45, s46, s13 +; GFX11-NEXT: s_bitset1_b32 s13, 22 +; GFX11-NEXT: s_addk_i32 s45, 0x7fff +; GFX11-NEXT: s_and_b32 s46, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s13, s13, s45 +; GFX11-NEXT: s_and_b32 s45, s12, 0xffff0000 +; GFX11-NEXT: s_lshr_b32 s13, s13, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s45 +; GFX11-NEXT: s_pack_ll_b32_b16 s73, s13, s73 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s45, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s46, s45, 0x10010 +; GFX11-NEXT: s_add_i32 s46, s46, s45 +; GFX11-NEXT: s_bitset1_b32 s45, 22 +; GFX11-NEXT: s_addk_i32 s46, 0x7fff +; GFX11-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s45, s45, s46 +; GFX11-NEXT: s_lshl_b32 s12, s12, 16 +; GFX11-NEXT: s_lshr_b32 s45, s45, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s12, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s46, s12, 0x10010 +; GFX11-NEXT: s_add_i32 s46, s46, s12 +; GFX11-NEXT: s_bitset1_b32 s12, 22 +; GFX11-NEXT: s_addk_i32 s46, 0x7fff +; GFX11-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s12, s12, s46 +; GFX11-NEXT: s_and_b32 s46, s11, 0xffff0000 +; GFX11-NEXT: s_lshr_b32 s12, s12, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s46 +; GFX11-NEXT: s_pack_ll_b32_b16 s72, s12, s45 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s46, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s47, s46, 0x10010 +; GFX11-NEXT: s_add_i32 s47, s47, s46 +; GFX11-NEXT: s_bitset1_b32 s46, 22 +; GFX11-NEXT: s_addk_i32 s47, 0x7fff +; GFX11-NEXT: s_and_b32 s56, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s46, s46, s47 +; GFX11-NEXT: s_lshl_b32 s11, s11, 16 +; GFX11-NEXT: s_lshr_b32 s75, s46, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s11 +; GFX11-NEXT: v_writelane_b32 v20, s75, 10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s11, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s47, s11, 0x10010 +; GFX11-NEXT: s_add_i32 s46, s47, s11 +; GFX11-NEXT: s_bitset1_b32 s11, 22 +; GFX11-NEXT: s_addk_i32 s46, 0x7fff +; GFX11-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s11, s11, s46 +; GFX11-NEXT: s_and_b32 s46, s10, 0xffff0000 +; GFX11-NEXT: s_lshr_b32 s11, s11, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s46 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s46, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s47, s46, 0x10010 +; GFX11-NEXT: s_add_i32 s47, s47, s46 +; GFX11-NEXT: s_bitset1_b32 s46, 22 +; GFX11-NEXT: s_addk_i32 s47, 0x7fff +; GFX11-NEXT: s_and_b32 s56, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s46, s46, s47 +; GFX11-NEXT: s_lshl_b32 s10, s10, 16 +; GFX11-NEXT: s_lshr_b32 s46, s46, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s10, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s47, s10, 0x10010 +; GFX11-NEXT: s_add_i32 s47, s47, s10 +; GFX11-NEXT: s_bitset1_b32 s10, 22 +; GFX11-NEXT: s_addk_i32 s47, 0x7fff +; GFX11-NEXT: s_and_b32 s56, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s10, s10, s47 +; GFX11-NEXT: s_and_b32 s47, s9, 0xffff0000 +; GFX11-NEXT: s_lshr_b32 s10, s10, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s47 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s47, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s56, s47, 0x10010 +; GFX11-NEXT: s_add_i32 s56, s56, s47 +; GFX11-NEXT: s_bitset1_b32 s47, 22 +; GFX11-NEXT: s_addk_i32 s56, 0x7fff +; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s47, s47, s56 +; GFX11-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-NEXT: s_lshr_b32 s78, s47, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s9 +; GFX11-NEXT: v_writelane_b32 v20, s78, 11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s9, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s56, s9, 0x10010 +; GFX11-NEXT: s_add_i32 s47, s56, s9 +; GFX11-NEXT: s_bitset1_b32 s9, 22 +; GFX11-NEXT: s_addk_i32 s47, 0x7fff +; GFX11-NEXT: s_and_b32 s56, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s9, s9, s47 +; GFX11-NEXT: s_and_b32 s47, s8, 0xffff0000 +; GFX11-NEXT: s_lshr_b32 s9, s9, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s47 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s47, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s56, s47, 0x10010 +; GFX11-NEXT: s_add_i32 s56, s56, s47 +; GFX11-NEXT: s_bitset1_b32 s47, 22 +; GFX11-NEXT: s_addk_i32 s56, 0x7fff +; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s47, s47, s56 +; GFX11-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-NEXT: s_lshr_b32 s58, s47, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s8, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s56, s8, 0x10010 +; GFX11-NEXT: s_add_i32 s47, s56, s8 +; GFX11-NEXT: s_bitset1_b32 s8, 22 +; GFX11-NEXT: s_addk_i32 s47, 0x7fff +; GFX11-NEXT: s_and_b32 s56, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s8, s8, s47 +; GFX11-NEXT: s_and_b32 s47, s7, 0xffff0000 +; GFX11-NEXT: s_lshr_b32 s8, s8, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s47 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s47, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s56, s47, 0x10010 +; GFX11-NEXT: s_add_i32 s56, s56, s47 +; GFX11-NEXT: s_bitset1_b32 s47, 22 +; GFX11-NEXT: s_addk_i32 s56, 0x7fff +; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s47, s47, s56 +; GFX11-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-NEXT: s_lshr_b32 s79, s47, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s7 +; GFX11-NEXT: v_writelane_b32 v20, s79, 12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s7, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s56, s7, 0x10010 +; GFX11-NEXT: s_add_i32 s47, s56, s7 +; GFX11-NEXT: s_bitset1_b32 s7, 22 +; GFX11-NEXT: s_addk_i32 s47, 0x7fff +; GFX11-NEXT: s_and_b32 s56, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s7, s7, s47 +; GFX11-NEXT: s_and_b32 s47, s6, 0xffff0000 +; GFX11-NEXT: s_lshr_b32 s7, s7, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s47 +; GFX11-NEXT: s_pack_ll_b32_b16 s45, s7, s79 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s47, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s56, s47, 0x10010 +; GFX11-NEXT: s_add_i32 s56, s56, s47 +; GFX11-NEXT: s_bitset1_b32 s47, 22 +; GFX11-NEXT: s_addk_i32 s56, 0x7fff +; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s47, s47, s56 +; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: s_lshr_b32 s59, s47, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s6, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s56, s6, 0x10010 +; GFX11-NEXT: s_add_i32 s47, s56, s6 +; GFX11-NEXT: s_bitset1_b32 s6, 22 +; GFX11-NEXT: s_addk_i32 s47, 0x7fff +; GFX11-NEXT: s_and_b32 s56, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s6, s6, s47 +; GFX11-NEXT: s_and_b32 s47, s5, 0xffff0000 +; GFX11-NEXT: s_lshr_b32 s6, s6, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s47 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s47, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s56, s47, 0x10010 +; GFX11-NEXT: s_add_i32 s56, s56, s47 +; GFX11-NEXT: s_bitset1_b32 s47, 22 +; GFX11-NEXT: s_addk_i32 s56, 0x7fff +; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s47, s47, s56 +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_lshr_b32 s92, s47, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 +; GFX11-NEXT: v_writelane_b32 v20, s92, 13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s56, s5, 0x10010 +; GFX11-NEXT: s_add_i32 s47, s56, s5 +; GFX11-NEXT: s_bitset1_b32 s5, 22 +; GFX11-NEXT: s_addk_i32 s47, 0x7fff +; GFX11-NEXT: s_and_b32 s56, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s5, s5, s47 +; GFX11-NEXT: s_and_b32 s47, s4, 0xffff0000 +; GFX11-NEXT: s_lshr_b32 s5, s5, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s47 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s47, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s56, s47, 0x10010 +; GFX11-NEXT: s_add_i32 s56, s56, s47 +; GFX11-NEXT: s_bitset1_b32 s47, 22 +; GFX11-NEXT: s_addk_i32 s56, 0x7fff +; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s47, s47, s56 +; GFX11-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-NEXT: s_lshr_b32 s60, s47, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s56, s4, 0x10010 +; GFX11-NEXT: s_add_i32 s47, s56, s4 +; GFX11-NEXT: s_bitset1_b32 s4, 22 +; GFX11-NEXT: s_addk_i32 s47, 0x7fff +; GFX11-NEXT: s_and_b32 s56, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s4, s4, s47 +; GFX11-NEXT: s_and_b32 s47, s1, 0xffff0000 +; GFX11-NEXT: s_lshr_b32 s4, s4, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s47 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s47, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s56, s47, 0x10010 +; GFX11-NEXT: s_add_i32 s56, s56, s47 +; GFX11-NEXT: s_bitset1_b32 s47, 22 +; GFX11-NEXT: s_addk_i32 s56, 0x7fff +; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s47, s47, s56 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_lshr_b32 s93, s47, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-NEXT: v_writelane_b32 v20, s93, 14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s56, s1, 0x10010 +; GFX11-NEXT: s_add_i32 s47, s56, s1 +; GFX11-NEXT: s_bitset1_b32 s1, 22 +; GFX11-NEXT: s_addk_i32 s47, 0x7fff +; GFX11-NEXT: s_and_b32 s56, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s1, s1, s47 +; GFX11-NEXT: s_and_b32 s47, s0, 0xffff0000 +; GFX11-NEXT: s_lshr_b32 s1, s1, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s47 +; GFX11-NEXT: s_pack_ll_b32_b16 s95, s1, s93 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s47, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s56, s47, 0x10010 +; GFX11-NEXT: s_add_i32 s56, s56, s47 +; GFX11-NEXT: s_bitset1_b32 s47, 22 +; GFX11-NEXT: s_addk_i32 s56, 0x7fff +; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s47, s47, s56 +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: s_lshr_b32 s61, s47, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s56, s0, 0x10010 +; GFX11-NEXT: s_add_i32 s47, s56, s0 +; GFX11-NEXT: s_bitset1_b32 s0, 22 +; GFX11-NEXT: s_addk_i32 s47, 0x7fff +; GFX11-NEXT: s_and_b32 s56, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s0, s0, s47 +; GFX11-NEXT: s_and_b32 s47, s3, 0xffff0000 +; GFX11-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s47 +; GFX11-NEXT: s_pack_ll_b32_b16 s94, s0, s61 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s47, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s56, s47, 0x10010 +; GFX11-NEXT: s_add_i32 s56, s56, s47 +; GFX11-NEXT: s_bitset1_b32 s47, 22 +; GFX11-NEXT: s_addk_i32 s56, 0x7fff +; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s47, s47, s56 +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_lshr_b32 s34, s47, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s3 +; GFX11-NEXT: v_writelane_b32 v20, s34, 15 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s3, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s56, s3, 0x10010 +; GFX11-NEXT: s_add_i32 s47, s56, s3 +; GFX11-NEXT: s_bitset1_b32 s3, 22 +; GFX11-NEXT: s_addk_i32 s47, 0x7fff +; GFX11-NEXT: s_and_b32 s56, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s3, s3, s47 +; GFX11-NEXT: s_and_b32 s47, s2, 0xffff0000 +; GFX11-NEXT: s_lshr_b32 s3, s3, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s47 +; GFX11-NEXT: s_pack_ll_b32_b16 s93, s3, s34 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s47, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s56, s47, 0x10010 +; GFX11-NEXT: s_add_i32 s56, s56, s47 +; GFX11-NEXT: s_bitset1_b32 s47, 22 +; GFX11-NEXT: s_addk_i32 s56, 0x7fff +; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s47, s47, s56 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: s_lshr_b32 s62, s47, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s56, s2, 0x10010 +; GFX11-NEXT: s_add_i32 s47, s56, s2 +; GFX11-NEXT: s_bitset1_b32 s2, 22 +; GFX11-NEXT: s_addk_i32 s47, 0x7fff +; GFX11-NEXT: s_and_b32 s56, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s2, s2, s47 +; GFX11-NEXT: s_and_b32 s47, s17, 0xffff0000 +; GFX11-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s47 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s47, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s56, s47, 0x10010 +; GFX11-NEXT: s_add_i32 s56, s56, s47 +; GFX11-NEXT: s_bitset1_b32 s47, 22 +; GFX11-NEXT: s_addk_i32 s56, 0x7fff +; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s47, s47, s56 +; GFX11-NEXT: s_lshl_b32 s17, s17, 16 +; GFX11-NEXT: s_lshr_b32 s35, s47, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s17 +; GFX11-NEXT: v_writelane_b32 v20, s35, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s17, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s56, s17, 0x10010 +; GFX11-NEXT: s_add_i32 s47, s56, s17 +; GFX11-NEXT: s_bitset1_b32 s17, 22 +; GFX11-NEXT: s_addk_i32 s47, 0x7fff +; GFX11-NEXT: s_and_b32 s56, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s17, s17, s47 +; GFX11-NEXT: s_and_b32 s47, s16, 0xffff0000 +; GFX11-NEXT: s_lshr_b32 s17, s17, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s47 +; GFX11-NEXT: s_pack_ll_b32_b16 s79, s17, s35 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s47, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s56, s47, 0x10010 +; GFX11-NEXT: s_add_i32 s56, s56, s47 +; GFX11-NEXT: s_bitset1_b32 s47, 22 +; GFX11-NEXT: s_addk_i32 s56, 0x7fff +; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s47, s47, s56 +; GFX11-NEXT: s_lshl_b32 s16, s16, 16 +; GFX11-NEXT: s_lshr_b32 s63, s47, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s16, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s56, s16, 0x10010 +; GFX11-NEXT: s_add_i32 s47, s56, s16 +; GFX11-NEXT: s_bitset1_b32 s16, 22 +; GFX11-NEXT: s_addk_i32 s47, 0x7fff +; GFX11-NEXT: s_and_b32 s56, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s16, s16, s47 +; GFX11-NEXT: s_and_b32 s47, s19, 0xffff0000 +; GFX11-NEXT: s_lshr_b32 s16, s16, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s47 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s47, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s56, s47, 0x10010 +; GFX11-NEXT: s_add_i32 s56, s56, s47 +; GFX11-NEXT: s_bitset1_b32 s47, 22 +; GFX11-NEXT: s_addk_i32 s56, 0x7fff +; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s47, s47, s56 +; GFX11-NEXT: s_lshl_b32 s19, s19, 16 +; GFX11-NEXT: s_lshr_b32 s36, s47, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s19 +; GFX11-NEXT: v_writelane_b32 v20, s36, 17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s19, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s56, s19, 0x10010 +; GFX11-NEXT: s_add_i32 s47, s56, s19 +; GFX11-NEXT: s_bitset1_b32 s19, 22 +; GFX11-NEXT: s_addk_i32 s47, 0x7fff +; GFX11-NEXT: s_and_b32 s56, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s19, s19, s47 +; GFX11-NEXT: s_and_b32 s47, s18, 0xffff0000 +; GFX11-NEXT: s_lshr_b32 s19, s19, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s47 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s47, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s56, s47, 0x10010 +; GFX11-NEXT: s_add_i32 s56, s56, s47 +; GFX11-NEXT: s_bitset1_b32 s47, 22 +; GFX11-NEXT: s_addk_i32 s56, 0x7fff +; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s47, s47, s56 +; GFX11-NEXT: s_lshl_b32 s18, s18, 16 +; GFX11-NEXT: s_lshr_b32 s74, s47, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s18, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s56, s18, 0x10010 +; GFX11-NEXT: s_add_i32 s47, s56, s18 +; GFX11-NEXT: s_bitset1_b32 s18, 22 +; GFX11-NEXT: s_addk_i32 s47, 0x7fff +; GFX11-NEXT: s_and_b32 s56, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s18, s18, s47 +; GFX11-NEXT: s_and_b32 s47, s21, 0xffff0000 +; GFX11-NEXT: s_lshr_b32 s18, s18, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s47 +; GFX11-NEXT: s_pack_ll_b32_b16 s74, s18, s74 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s47, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s56, s47, 0x10010 +; GFX11-NEXT: s_add_i32 s56, s56, s47 +; GFX11-NEXT: s_bitset1_b32 s47, 22 +; GFX11-NEXT: s_addk_i32 s56, 0x7fff +; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s47, s47, s56 +; GFX11-NEXT: s_lshl_b32 s21, s21, 16 +; GFX11-NEXT: s_lshr_b32 s37, s47, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s21 +; GFX11-NEXT: v_writelane_b32 v20, s37, 18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s21, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s56, s21, 0x10010 +; GFX11-NEXT: s_add_i32 s47, s56, s21 +; GFX11-NEXT: s_bitset1_b32 s21, 22 +; GFX11-NEXT: s_addk_i32 s47, 0x7fff +; GFX11-NEXT: s_and_b32 s56, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s21, s21, s47 +; GFX11-NEXT: s_and_b32 s47, s20, 0xffff0000 +; GFX11-NEXT: s_lshr_b32 s21, s21, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s47 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s47, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s56, s47, 0x10010 +; GFX11-NEXT: s_add_i32 s56, s56, s47 +; GFX11-NEXT: s_bitset1_b32 s47, 22 +; GFX11-NEXT: s_addk_i32 s56, 0x7fff +; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s47, s47, s56 +; GFX11-NEXT: s_lshl_b32 s20, s20, 16 +; GFX11-NEXT: s_lshr_b32 s90, s47, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s20, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s56, s20, 0x10010 +; GFX11-NEXT: s_add_i32 s47, s56, s20 +; GFX11-NEXT: s_bitset1_b32 s20, 22 +; GFX11-NEXT: s_addk_i32 s47, 0x7fff +; GFX11-NEXT: s_and_b32 s56, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s20, s20, s47 +; GFX11-NEXT: s_and_b32 s47, s23, 0xffff0000 +; GFX11-NEXT: s_lshr_b32 s20, s20, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s47 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s47, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s56, s47, 0x10010 +; GFX11-NEXT: s_add_i32 s56, s56, s47 +; GFX11-NEXT: s_bitset1_b32 s47, 22 +; GFX11-NEXT: s_addk_i32 s56, 0x7fff +; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s47, s47, s56 +; GFX11-NEXT: s_lshl_b32 s23, s23, 16 +; GFX11-NEXT: s_lshr_b32 s38, s47, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s23 +; GFX11-NEXT: v_writelane_b32 v20, s38, 19 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s23, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s56, s23, 0x10010 +; GFX11-NEXT: s_add_i32 s47, s56, s23 +; GFX11-NEXT: s_bitset1_b32 s23, 22 +; GFX11-NEXT: s_addk_i32 s47, 0x7fff +; GFX11-NEXT: s_and_b32 s56, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s23, s23, s47 +; GFX11-NEXT: s_and_b32 s47, s22, 0xffff0000 +; GFX11-NEXT: s_lshr_b32 s23, s23, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s47 +; GFX11-NEXT: s_pack_ll_b32_b16 s35, s23, s38 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s47, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s56, s47, 0x10010 +; GFX11-NEXT: s_add_i32 s56, s56, s47 +; GFX11-NEXT: s_bitset1_b32 s47, 22 +; GFX11-NEXT: s_addk_i32 s56, 0x7fff +; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s47, s47, s56 +; GFX11-NEXT: s_lshl_b32 s22, s22, 16 +; GFX11-NEXT: s_lshr_b32 s91, s47, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s22 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s22, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s56, s22, 0x10010 +; GFX11-NEXT: s_add_i32 s47, s56, s22 +; GFX11-NEXT: s_bitset1_b32 s22, 22 +; GFX11-NEXT: s_addk_i32 s47, 0x7fff +; GFX11-NEXT: s_and_b32 s56, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s22, s22, s47 +; GFX11-NEXT: s_and_b32 s47, s25, 0xffff0000 +; GFX11-NEXT: s_lshr_b32 s22, s22, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s47 +; GFX11-NEXT: s_pack_ll_b32_b16 s34, s22, s91 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s47, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s56, s47, 0x10010 +; GFX11-NEXT: s_add_i32 s56, s56, s47 +; GFX11-NEXT: s_bitset1_b32 s47, 22 +; GFX11-NEXT: s_addk_i32 s56, 0x7fff +; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s47, s47, s56 +; GFX11-NEXT: s_lshl_b32 s25, s25, 16 +; GFX11-NEXT: s_lshr_b32 s39, s47, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s25 +; GFX11-NEXT: v_writelane_b32 v20, s39, 20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s25, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s56, s25, 0x10010 +; GFX11-NEXT: s_add_i32 s47, s56, s25 +; GFX11-NEXT: s_bitset1_b32 s25, 22 +; GFX11-NEXT: s_addk_i32 s47, 0x7fff +; GFX11-NEXT: s_and_b32 s56, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s25, s25, s47 +; GFX11-NEXT: s_and_b32 s47, s24, 0xffff0000 +; GFX11-NEXT: s_lshr_b32 s25, s25, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s47 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s47, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s56, s47, 0x10010 +; GFX11-NEXT: s_add_i32 s56, s56, s47 +; GFX11-NEXT: s_bitset1_b32 s47, 22 +; GFX11-NEXT: s_addk_i32 s56, 0x7fff +; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s47, s47, s56 +; GFX11-NEXT: s_lshl_b32 s24, s24, 16 +; GFX11-NEXT: s_lshr_b32 vcc_hi, s47, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s24 +; GFX11-NEXT: s_pack_ll_b32_b16 s57, s11, s75 +; GFX11-NEXT: s_pack_ll_b32_b16 s75, s19, s36 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s24, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s56, s24, 0x10010 +; GFX11-NEXT: s_add_i32 s47, s56, s24 +; GFX11-NEXT: s_bitset1_b32 s24, 22 +; GFX11-NEXT: s_addk_i32 s47, 0x7fff +; GFX11-NEXT: s_and_b32 s56, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s24, s24, s47 +; GFX11-NEXT: s_and_b32 s47, s27, 0xffff0000 +; GFX11-NEXT: s_lshr_b32 s24, s24, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s47 +; GFX11-NEXT: s_pack_ll_b32_b16 s56, s10, s46 +; GFX11-NEXT: s_pack_ll_b32_b16 s46, s8, s58 +; GFX11-NEXT: s_pack_ll_b32_b16 s47, s9, s78 +; GFX11-NEXT: s_pack_ll_b32_b16 s78, s16, s63 +; GFX11-NEXT: v_readfirstlane_b32 s42, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_pack_ll_b32_b16 s63, s21, s37 +; GFX11-NEXT: s_pack_ll_b32_b16 s37, s25, s39 +; GFX11-NEXT: s_pack_ll_b32_b16 s36, s24, vcc_hi +; GFX11-NEXT: s_bfe_u32 s43, s42, 0x10010 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s43, s43, s42 +; GFX11-NEXT: s_bitset1_b32 s42, 22 +; GFX11-NEXT: s_addk_i32 s43, 0x7fff +; GFX11-NEXT: s_and_b32 s44, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s42, s42, s43 +; GFX11-NEXT: s_lshl_b32 s27, s27, 16 +; GFX11-NEXT: s_lshr_b32 s71, s42, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s27 +; GFX11-NEXT: s_pack_ll_b32_b16 s44, s6, s59 +; GFX11-NEXT: s_pack_ll_b32_b16 s43, s5, s92 +; GFX11-NEXT: s_pack_ll_b32_b16 s92, s2, s62 +; GFX11-NEXT: s_pack_ll_b32_b16 s62, s20, s90 +; GFX11-NEXT: v_readfirstlane_b32 s27, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s58, s27, 0x10010 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s42, s58, s27 +; GFX11-NEXT: s_bitset1_b32 s27, 22 +; GFX11-NEXT: s_addk_i32 s42, 0x7fff +; GFX11-NEXT: s_and_b32 s58, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s27, s27, s42 +; GFX11-NEXT: s_and_b32 s58, s26, 0xffff0000 +; GFX11-NEXT: s_lshr_b32 s27, s27, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s58 +; GFX11-NEXT: s_pack_ll_b32_b16 s42, s4, s60 +; GFX11-NEXT: s_pack_ll_b32_b16 s91, s27, s71 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s58, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s59, s58, 0x10010 +; GFX11-NEXT: s_add_i32 s59, s59, s58 +; GFX11-NEXT: s_bitset1_b32 s58, 22 +; GFX11-NEXT: s_addk_i32 s59, 0x7fff +; GFX11-NEXT: s_and_b32 s60, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s58, s58, s59 +; GFX11-NEXT: s_lshl_b32 s26, s26, 16 +; GFX11-NEXT: s_lshr_b32 s90, s58, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s26 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s26, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_bfe_u32 s59, s26, 0x10010 +; GFX11-NEXT: s_add_i32 s58, s59, s26 +; GFX11-NEXT: s_bitset1_b32 s26, 22 +; GFX11-NEXT: s_addk_i32 s58, 0x7fff +; GFX11-NEXT: s_and_b32 s59, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s26, s26, s58 +; GFX11-NEXT: s_lshr_b64 s[58:59], s[36:37], 24 +; GFX11-NEXT: s_lshr_b32 s50, s62, 16 +; GFX11-NEXT: v_writelane_b32 v20, s58, 2 +; GFX11-NEXT: s_lshr_b32 s81, s62, 8 +; GFX11-NEXT: s_lshr_b32 s85, s75, 24 +; GFX11-NEXT: s_lshr_b32 s60, s75, 8 +; GFX11-NEXT: s_lshr_b32 s61, s74, 16 +; GFX11-NEXT: v_writelane_b32 v20, s59, 3 +; GFX11-NEXT: s_lshr_b64 s[58:59], s[34:35], 24 +; GFX11-NEXT: s_lshr_b32 s96, s74, 8 +; GFX11-NEXT: s_lshr_b64 s[74:75], s[74:75], 24 +; GFX11-NEXT: s_lshr_b32 s75, s42, 8 +; GFX11-NEXT: v_writelane_b32 v20, s58, 0 +; GFX11-NEXT: s_lshr_b32 s26, s26, 16 +; GFX11-NEXT: s_lshr_b32 s65, s73, 24 +; GFX11-NEXT: s_pack_ll_b32_b16 s90, s26, s90 +; GFX11-NEXT: s_lshr_b32 s82, s73, 8 +; GFX11-NEXT: v_writelane_b32 v20, s59, 1 +; GFX11-NEXT: s_lshr_b32 s58, s63, 24 +; GFX11-NEXT: s_lshr_b32 s59, s63, 8 +; GFX11-NEXT: s_lshr_b64 s[62:63], s[62:63], 24 +; GFX11-NEXT: s_lshr_b32 s63, s93, 24 +; GFX11-NEXT: s_lshr_b32 s84, s72, 16 +; GFX11-NEXT: v_writelane_b32 v20, s63, 21 +; GFX11-NEXT: s_lshr_b32 s63, s93, 8 +; GFX11-NEXT: s_lshr_b32 s51, s72, 8 +; GFX11-NEXT: s_lshr_b64 s[72:73], s[72:73], 24 +; GFX11-NEXT: s_lshr_b32 s86, s77, 24 +; GFX11-NEXT: v_writelane_b32 v20, s63, 22 +; GFX11-NEXT: s_lshr_b32 s63, s92, 16 +; GFX11-NEXT: s_lshr_b32 s87, s77, 8 +; GFX11-NEXT: s_lshr_b32 s52, s76, 16 +; GFX11-NEXT: s_lshr_b32 s100, s76, 8 +; GFX11-NEXT: v_writelane_b32 v20, s63, 23 +; GFX11-NEXT: s_lshr_b32 s63, s92, 8 +; GFX11-NEXT: s_lshr_b64 s[76:77], s[76:77], 24 +; GFX11-NEXT: s_lshr_b32 s101, s89, 8 +; GFX11-NEXT: s_lshr_b32 s98, s79, 24 +; GFX11-NEXT: v_writelane_b32 v20, s63, 24 +; GFX11-NEXT: s_lshr_b32 s63, s95, 24 +; GFX11-NEXT: s_lshr_b32 s99, s79, 8 +; GFX11-NEXT: s_lshr_b32 s53, s78, 16 +; GFX11-NEXT: s_lshr_b32 s97, s78, 8 +; GFX11-NEXT: v_writelane_b32 v20, s63, 25 +; GFX11-NEXT: s_lshr_b32 s63, s95, 8 +; GFX11-NEXT: s_lshr_b64 s[78:79], s[78:79], 24 +; GFX11-NEXT: s_lshr_b64 s[92:93], s[92:93], 24 +; GFX11-NEXT: s_lshr_b32 s102, s94, 16 +; GFX11-NEXT: v_writelane_b32 v20, s63, 26 +; GFX11-NEXT: s_lshr_b32 s63, s43, 24 +; GFX11-NEXT: s_lshr_b32 s103, s94, 8 +; GFX11-NEXT: s_lshr_b64 s[94:95], s[94:95], 24 +; GFX11-NEXT: s_lshr_b32 s73, s91, 24 +; GFX11-NEXT: v_writelane_b32 v20, s63, 27 +; GFX11-NEXT: s_lshr_b32 s63, s43, 8 +; GFX11-NEXT: s_lshr_b32 s77, s91, 8 +; GFX11-NEXT: s_lshr_b32 s83, s90, 8 +; GFX11-NEXT: s_lshr_b32 s66, s37, 24 +; GFX11-NEXT: v_writelane_b32 v20, s63, 28 +; GFX11-NEXT: s_lshr_b32 s63, s42, 16 +; GFX11-NEXT: s_lshr_b64 s[42:43], s[42:43], 24 +; GFX11-NEXT: s_lshr_b32 s67, s37, 8 +; GFX11-NEXT: s_lshr_b32 s68, s36, 16 +; GFX11-NEXT: v_writelane_b32 v20, s42, 6 +; GFX11-NEXT: s_lshr_b32 s49, s36, 8 +; GFX11-NEXT: s_lshr_b32 s69, s35, 24 +; GFX11-NEXT: s_lshr_b32 s70, s35, 8 +; GFX11-NEXT: s_lshr_b32 s64, s34, 16 +; GFX11-NEXT: v_writelane_b32 v20, s43, 7 +; GFX11-NEXT: s_lshr_b64 s[42:43], s[44:45], 24 +; GFX11-NEXT: s_lshr_b32 s80, s34, 8 +; GFX11-NEXT: s_lshr_b32 s79, s45, 24 +; GFX11-NEXT: s_lshr_b32 s93, s45, 8 +; GFX11-NEXT: v_writelane_b32 v20, s42, 4 +; GFX11-NEXT: s_lshr_b32 s95, s44, 16 +; GFX11-NEXT: s_lshr_b32 vcc_hi, s44, 8 +; GFX11-NEXT: s_lshr_b32 s34, s47, 24 +; GFX11-NEXT: s_lshr_b32 s55, s47, 8 +; GFX11-NEXT: v_writelane_b32 v20, s43, 5 +; GFX11-NEXT: s_lshr_b32 s43, s88, 16 +; GFX11-NEXT: s_lshr_b32 s42, s89, 24 +; GFX11-NEXT: s_lshr_b32 s35, s46, 16 +; GFX11-NEXT: s_lshr_b32 s36, s46, 8 +; GFX11-NEXT: v_writelane_b32 v20, s43, 29 +; GFX11-NEXT: s_lshr_b32 s43, s88, 8 +; GFX11-NEXT: s_lshr_b64 s[88:89], s[88:89], 24 +; GFX11-NEXT: s_lshr_b32 s89, s90, 16 +; GFX11-NEXT: s_lshr_b64 s[90:91], s[90:91], 24 +; GFX11-NEXT: v_writelane_b32 v20, s43, 30 +; GFX11-NEXT: s_lshr_b64 s[46:47], s[46:47], 24 +; GFX11-NEXT: s_lshr_b32 s37, s57, 24 +; GFX11-NEXT: s_lshr_b32 s38, s57, 8 +; GFX11-NEXT: s_lshr_b32 s39, s56, 16 +; GFX11-NEXT: s_lshr_b32 s48, s56, 8 +; GFX11-NEXT: s_lshr_b64 s[56:57], s[56:57], 24 +; GFX11-NEXT: s_lshr_b32 s91, s31, 24 +; GFX11-NEXT: s_lshr_b32 s43, s31, 8 +; GFX11-NEXT: s_lshr_b32 s44, s30, 16 +; GFX11-NEXT: s_lshr_b32 s45, s30, 8 +; GFX11-NEXT: s_lshr_b64 s[30:31], s[30:31], 24 +; GFX11-NEXT: v_writelane_b32 v20, s43, 31 +; GFX11-NEXT: .LBB91_5: ; %end +; GFX11-NEXT: s_lshl_b32 s47, s103, 8 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_and_b32 s57, s102, 0xff +; GFX11-NEXT: s_or_b32 s0, s0, s47 +; GFX11-NEXT: s_lshl_b32 s47, s94, 8 +; GFX11-NEXT: v_readlane_b32 s43, v20, 26 +; GFX11-NEXT: s_or_b32 s47, s57, s47 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s47, s47, 16 +; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_or_b32 s0, s0, s47 +; GFX11-NEXT: s_lshl_b32 s47, s43, 8 +; GFX11-NEXT: v_readlane_b32 s43, v20, 25 +; GFX11-NEXT: s_or_b32 s1, s1, s47 +; GFX11-NEXT: v_readlane_b32 s47, v20, 14 +; GFX11-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s57, s43, 8 +; GFX11-NEXT: v_readlane_b32 s43, v20, 24 +; GFX11-NEXT: s_and_b32 s47, s47, 0xff +; GFX11-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-NEXT: s_or_b32 s47, s47, s57 +; GFX11-NEXT: v_readlane_b32 s103, v19, 7 +; GFX11-NEXT: s_lshl_b32 s47, s47, 16 +; GFX11-NEXT: v_readlane_b32 s102, v19, 6 +; GFX11-NEXT: s_or_b32 s1, s1, s47 +; GFX11-NEXT: s_lshl_b32 s47, s43, 8 +; GFX11-NEXT: v_readlane_b32 s43, v20, 23 +; GFX11-NEXT: s_or_b32 s2, s2, s47 +; GFX11-NEXT: s_lshl_b32 s47, s92, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-NEXT: s_and_b32 s57, s43, 0xff +; GFX11-NEXT: v_readlane_b32 s43, v20, 22 +; GFX11-NEXT: s_or_b32 s47, s57, s47 +; GFX11-NEXT: s_lshl_b32 s0, s97, 8 +; GFX11-NEXT: s_lshl_b32 s47, s47, 16 +; GFX11-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-NEXT: s_or_b32 s2, s2, s47 +; GFX11-NEXT: s_lshl_b32 s47, s43, 8 +; GFX11-NEXT: v_readlane_b32 s43, v20, 21 +; GFX11-NEXT: s_or_b32 s3, s3, s47 +; GFX11-NEXT: v_readlane_b32 s47, v20, 15 +; GFX11-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_lshl_b32 s57, s43, 8 +; GFX11-NEXT: s_lshl_b32 s1, s78, 8 +; GFX11-NEXT: s_and_b32 s47, s47, 0xff +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_or_b32 s47, s47, s57 +; GFX11-NEXT: s_and_b32 s16, s61, 0xff +; GFX11-NEXT: s_lshl_b32 s47, s47, 16 +; GFX11-NEXT: v_readlane_b32 s97, v19, 1 +; GFX11-NEXT: s_or_b32 s3, s3, s47 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-NEXT: s_and_b32 s2, s53, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s98, 8 +; GFX11-NEXT: s_or_b32 s1, s2, s1 +; GFX11-NEXT: s_lshl_b32 s2, s99, 8 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: v_readlane_b32 s99, v19, 3 +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_and_b32 s1, s17, 0xff +; GFX11-NEXT: s_lshl_b32 s17, s85, 8 +; GFX11-NEXT: s_or_b32 s1, s1, s2 +; GFX11-NEXT: v_readlane_b32 s2, v20, 16 +; GFX11-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-NEXT: v_readlane_b32 s98, v19, 2 +; GFX11-NEXT: v_readlane_b32 s85, v18, 29 +; GFX11-NEXT: v_readlane_b32 s53, v18, 13 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: v_readlane_b32 s31, v18, 1 +; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: s_and_b32 s3, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: s_lshl_b32 s18, s69, 8 +; GFX11-NEXT: s_or_b32 s1, s1, s2 +; GFX11-NEXT: s_lshl_b32 s2, s96, 8 +; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_lshl_b32 s3, s74, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_or_b32 s3, s16, s3 +; GFX11-NEXT: s_lshl_b32 s16, s60, 8 +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_lshl_b32 s0, s81, 8 +; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: s_and_b32 s3, s19, 0xff +; GFX11-NEXT: s_and_b32 s1, s20, 0xff +; GFX11-NEXT: s_or_b32 s3, s3, s16 +; GFX11-NEXT: v_readlane_b32 s16, v20, 17 +; GFX11-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_lshl_b32 s1, s62, 8 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s19, s73, 8 +; GFX11-NEXT: s_or_b32 s16, s16, s17 +; GFX11-NEXT: v_readlane_b32 s96, v19, 0 +; GFX11-NEXT: s_lshl_b32 s16, s16, 16 +; GFX11-NEXT: v_readlane_b32 s81, v18, 25 +; GFX11-NEXT: s_or_b32 s3, s3, s16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-NEXT: s_and_b32 s2, s50, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s58, 8 +; GFX11-NEXT: s_or_b32 s1, s2, s1 +; GFX11-NEXT: s_lshl_b32 s2, s59, 8 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: v_readlane_b32 s16, v20, 0 +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_and_b32 s1, s21, 0xff +; GFX11-NEXT: v_readlane_b32 s17, v20, 1 +; GFX11-NEXT: s_or_b32 s1, s1, s2 +; GFX11-NEXT: v_readlane_b32 s2, v20, 18 +; GFX11-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-NEXT: s_lshl_b32 s17, s70, 8 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-NEXT: v_readlane_b32 s70, v18, 22 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: v_readlane_b32 s69, v18, 21 +; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: v_readlane_b32 s50, v18, 10 +; GFX11-NEXT: s_or_b32 s1, s1, s2 +; GFX11-NEXT: s_lshl_b32 s2, s80, 8 +; GFX11-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_lshl_b32 s3, s16, 8 +; GFX11-NEXT: s_and_b32 s16, s64, 0xff +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_or_b32 s3, s16, s3 +; GFX11-NEXT: s_and_b32 s16, s23, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s16, s16, s17 +; GFX11-NEXT: v_readlane_b32 s17, v20, 19 +; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: s_and_b32 s3, s16, 0xffff +; GFX11-NEXT: s_and_b32 s0, s24, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s49, 8 +; GFX11-NEXT: s_and_b32 s17, s17, 0xff +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_or_b32 s17, s17, s18 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s16, s17, 16 +; GFX11-NEXT: s_and_b32 s18, s71, 0xff +; GFX11-NEXT: s_or_b32 s3, s3, s16 +; GFX11-NEXT: v_readlane_b32 s16, v20, 2 +; GFX11-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 +; GFX11-NEXT: s_and_b32 s2, s68, 0xff +; GFX11-NEXT: v_readlane_b32 s17, v20, 3 +; GFX11-NEXT: s_lshl_b32 s3, s16, 8 +; GFX11-NEXT: v_readlane_b32 s16, v20, 20 +; GFX11-NEXT: s_or_b32 s1, s2, s3 +; GFX11-NEXT: s_and_b32 s2, s25, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s67, 8 +; GFX11-NEXT: s_lshl_b32 s17, s66, 8 +; GFX11-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: s_or_b32 s3, s16, s17 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_or_b32 s1, s2, s3 +; GFX11-NEXT: s_and_b32 s2, s26, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s83, 8 +; GFX11-NEXT: s_and_b32 s16, s89, 0xff +; GFX11-NEXT: s_lshl_b32 s17, s90, 8 +; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: s_or_b32 s3, s16, s17 +; GFX11-NEXT: s_and_b32 s16, s27, 0xff +; GFX11-NEXT: s_lshl_b32 s17, s77, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_or_b32 s16, s16, s17 +; GFX11-NEXT: s_or_b32 s17, s18, s19 +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_and_b32 s16, s16, 0xffff +; GFX11-NEXT: s_lshl_b32 s17, s17, 16 +; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: s_or_b32 s3, s16, s17 +; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-NEXT: s_and_b32 s0, s40, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s45, 8 +; GFX11-NEXT: s_and_b32 s2, s44, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s30, 8 +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_or_b32 s1, s2, s3 +; GFX11-NEXT: v_readlane_b32 s3, v20, 31 +; GFX11-NEXT: s_and_b32 s2, s41, 0xff +; GFX11-NEXT: s_and_b32 s16, s104, 0xff +; GFX11-NEXT: s_lshl_b32 s17, s91, 8 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: s_or_b32 s3, s16, s17 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_or_b32 s1, s2, s3 +; GFX11-NEXT: v_readlane_b32 s3, v20, 30 +; GFX11-NEXT: v_readlane_b32 s16, v20, 29 +; GFX11-NEXT: s_and_b32 s2, s28, 0xff +; GFX11-NEXT: s_lshl_b32 s17, s88, 8 +; GFX11-NEXT: s_and_b32 s18, s54, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: s_or_b32 s3, s16, s17 +; GFX11-NEXT: s_and_b32 s16, s29, 0xff +; GFX11-NEXT: s_lshl_b32 s17, s101, 8 +; GFX11-NEXT: s_lshl_b32 s19, s42, 8 +; GFX11-NEXT: s_or_b32 s16, s16, s17 +; GFX11-NEXT: s_or_b32 s17, s18, s19 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_and_b32 s16, s16, 0xffff +; GFX11-NEXT: s_lshl_b32 s17, s17, 16 +; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-NEXT: s_and_b32 s0, s14, 0xff +; GFX11-NEXT: v_readlane_b32 s14, v20, 8 +; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: s_or_b32 s3, s16, s17 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-NEXT: s_lshl_b32 s1, s100, 8 +; GFX11-NEXT: s_and_b32 s2, s52, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s76, 8 +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_or_b32 s1, s2, s3 +; GFX11-NEXT: s_and_b32 s2, s15, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s87, 8 +; GFX11-NEXT: s_and_b32 s14, s14, 0xff +; GFX11-NEXT: s_lshl_b32 s15, s86, 8 +; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: s_or_b32 s3, s14, s15 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_or_b32 s1, s2, s3 +; GFX11-NEXT: s_and_b32 s2, s12, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s51, 8 +; GFX11-NEXT: s_and_b32 s12, s84, 0xff +; GFX11-NEXT: s_lshl_b32 s14, s72, 8 +; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: s_or_b32 s3, s12, s14 +; GFX11-NEXT: v_readlane_b32 s14, v20, 9 +; GFX11-NEXT: s_and_b32 s12, s13, 0xff +; GFX11-NEXT: s_lshl_b32 s13, s82, 8 +; GFX11-NEXT: s_lshl_b32 s15, s65, 8 +; GFX11-NEXT: s_or_b32 s12, s12, s13 +; GFX11-NEXT: s_and_b32 s14, s14, 0xff +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_or_b32 s13, s14, s15 +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_and_b32 s12, s12, 0xffff +; GFX11-NEXT: s_lshl_b32 s13, s13, 16 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:48 +; GFX11-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 +; GFX11-NEXT: s_and_b32 s0, s10, 0xff +; GFX11-NEXT: v_readlane_b32 s10, v20, 10 +; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: s_or_b32 s3, s12, s13 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 +; GFX11-NEXT: s_lshl_b32 s1, s48, 8 +; GFX11-NEXT: s_and_b32 s2, s39, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s56, 8 +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_or_b32 s1, s2, s3 +; GFX11-NEXT: s_and_b32 s2, s11, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s38, 8 +; GFX11-NEXT: s_and_b32 s10, s10, 0xff +; GFX11-NEXT: s_lshl_b32 s11, s37, 8 +; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: s_or_b32 s3, s10, s11 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_or_b32 s1, s2, s3 +; GFX11-NEXT: s_and_b32 s2, s8, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s36, 8 +; GFX11-NEXT: s_and_b32 s8, s35, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s46, 8 +; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: s_or_b32 s3, s8, s10 +; GFX11-NEXT: v_readlane_b32 s10, v20, 11 +; GFX11-NEXT: s_and_b32 s8, s9, 0xff +; GFX11-NEXT: s_lshl_b32 s9, s55, 8 +; GFX11-NEXT: s_lshl_b32 s11, s34, 8 +; GFX11-NEXT: s_or_b32 s8, s8, s9 +; GFX11-NEXT: s_and_b32 s10, s10, 0xff +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_or_b32 s9, s10, s11 +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_and_b32 s8, s8, 0xffff +; GFX11-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: s_or_b32 s3, s8, s9 +; GFX11-NEXT: v_readlane_b32 s8, v20, 4 +; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-NEXT: s_and_b32 s0, s6, 0xff +; GFX11-NEXT: v_readlane_b32 s6, v20, 12 +; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-NEXT: s_lshl_b32 s1, vcc_hi, 8 +; GFX11-NEXT: s_and_b32 s2, s95, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s8, 8 +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_or_b32 s1, s2, s3 +; GFX11-NEXT: s_and_b32 s2, s7, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s93, 8 +; GFX11-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s79, 8 +; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: s_or_b32 s3, s6, s7 +; GFX11-NEXT: v_readlane_b32 s6, v20, 6 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_or_b32 s1, s2, s3 +; GFX11-NEXT: s_and_b32 s2, s4, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s75, 8 +; GFX11-NEXT: s_and_b32 s4, s63, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-NEXT: v_readlane_b32 s7, v20, 7 +; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: s_or_b32 s3, s4, s6 +; GFX11-NEXT: s_and_b32 s4, s5, 0xff +; GFX11-NEXT: v_readlane_b32 s5, v20, 28 +; GFX11-NEXT: v_readlane_b32 s6, v20, 13 +; GFX11-NEXT: v_readlane_b32 s7, v20, 27 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: s_or_b32 s3, s4, s5 +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:64 +; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-NEXT: v_readlane_b32 s9, v20, 5 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:80 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:96 +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:112 +; GFX11-NEXT: v_readlane_b32 s104, v19, 8 +; GFX11-NEXT: v_readlane_b32 s101, v19, 5 +; GFX11-NEXT: v_readlane_b32 s100, v19, 4 +; GFX11-NEXT: v_readlane_b32 s87, v18, 31 +; GFX11-NEXT: v_readlane_b32 s86, v18, 30 +; GFX11-NEXT: v_readlane_b32 s84, v18, 28 +; GFX11-NEXT: v_readlane_b32 s83, v18, 27 +; GFX11-NEXT: v_readlane_b32 s82, v18, 26 +; GFX11-NEXT: v_readlane_b32 s80, v18, 24 +; GFX11-NEXT: v_readlane_b32 s71, v18, 23 +; GFX11-NEXT: v_readlane_b32 s68, v18, 20 +; GFX11-NEXT: v_readlane_b32 s67, v18, 19 +; GFX11-NEXT: v_readlane_b32 s66, v18, 18 +; GFX11-NEXT: v_readlane_b32 s65, v18, 17 +; GFX11-NEXT: v_readlane_b32 s64, v18, 16 +; GFX11-NEXT: v_readlane_b32 s55, v18, 15 +; GFX11-NEXT: v_readlane_b32 s54, v18, 14 +; GFX11-NEXT: v_readlane_b32 s52, v18, 12 +; GFX11-NEXT: v_readlane_b32 s51, v18, 11 +; GFX11-NEXT: v_readlane_b32 s49, v18, 9 +; GFX11-NEXT: v_readlane_b32 s48, v18, 8 +; GFX11-NEXT: v_readlane_b32 s39, v18, 7 +; GFX11-NEXT: v_readlane_b32 s38, v18, 6 +; GFX11-NEXT: v_readlane_b32 s37, v18, 5 +; GFX11-NEXT: v_readlane_b32 s36, v18, 4 +; GFX11-NEXT: v_readlane_b32 s35, v18, 3 +; GFX11-NEXT: v_readlane_b32 s34, v18, 2 +; GFX11-NEXT: v_readlane_b32 s30, v18, 0 +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: s_clause 0x2 ; 12-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v18, off, s32 +; GFX11-NEXT: scratch_load_b32 v19, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v20, off, s32 offset:8 +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -194740,8 +193223,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: v_writelane_b32 v63, s30, 0 ; VI-NEXT: v_writelane_b32 v63, s31, 1 @@ -194774,26 +193257,55 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_writelane_b32 v63, s84, 28 ; VI-NEXT: v_writelane_b32 v63, s85, 29 ; VI-NEXT: v_writelane_b32 v63, s86, 30 +; VI-NEXT: v_readfirstlane_b32 s56, v3 +; VI-NEXT: v_mov_b32_e32 v3, s16 +; VI-NEXT: v_readfirstlane_b32 s57, v4 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_readfirstlane_b32 s46, v5 +; VI-NEXT: v_mov_b32_e32 v5, s18 +; VI-NEXT: v_readfirstlane_b32 s47, v6 +; VI-NEXT: v_mov_b32_e32 v6, s19 +; VI-NEXT: v_readfirstlane_b32 s44, v7 +; VI-NEXT: v_mov_b32_e32 v7, s20 +; VI-NEXT: v_readfirstlane_b32 s45, v8 +; VI-NEXT: v_mov_b32_e32 v8, s21 +; VI-NEXT: v_readfirstlane_b32 s42, v9 +; VI-NEXT: v_mov_b32_e32 v9, s22 +; VI-NEXT: v_readfirstlane_b32 s43, v10 +; VI-NEXT: v_mov_b32_e32 v10, s23 +; VI-NEXT: v_readfirstlane_b32 s40, v11 +; VI-NEXT: v_mov_b32_e32 v11, s24 +; VI-NEXT: v_readfirstlane_b32 s41, v12 +; VI-NEXT: v_mov_b32_e32 v12, s25 +; VI-NEXT: v_readfirstlane_b32 s24, v13 +; VI-NEXT: v_mov_b32_e32 v13, s26 +; VI-NEXT: v_readfirstlane_b32 s25, v14 +; VI-NEXT: v_mov_b32_e32 v14, s27 +; VI-NEXT: v_readfirstlane_b32 s22, v15 +; VI-NEXT: v_mov_b32_e32 v15, s28 +; VI-NEXT: v_readfirstlane_b32 s23, v16 +; VI-NEXT: v_mov_b32_e32 v16, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 ; VI-NEXT: v_writelane_b32 v63, s87, 31 -; VI-NEXT: v_readfirstlane_b32 s44, v3 -; VI-NEXT: v_readfirstlane_b32 s45, v4 -; VI-NEXT: v_readfirstlane_b32 s42, v5 -; VI-NEXT: v_readfirstlane_b32 s43, v6 -; VI-NEXT: v_readfirstlane_b32 s40, v7 -; VI-NEXT: v_readfirstlane_b32 s41, v8 -; VI-NEXT: v_readfirstlane_b32 s14, v9 -; VI-NEXT: v_readfirstlane_b32 s15, v10 -; VI-NEXT: v_readfirstlane_b32 s12, v11 -; VI-NEXT: v_readfirstlane_b32 s13, v12 -; VI-NEXT: v_readfirstlane_b32 s10, v13 -; VI-NEXT: v_readfirstlane_b32 s11, v14 -; VI-NEXT: v_readfirstlane_b32 s8, v15 -; VI-NEXT: v_readfirstlane_b32 s9, v16 -; VI-NEXT: v_readfirstlane_b32 s6, v17 -; VI-NEXT: v_readfirstlane_b32 s7, v18 +; VI-NEXT: v_mov_b32_e32 v39, v0 +; VI-NEXT: v_readfirstlane_b32 s20, v17 +; VI-NEXT: v_readfirstlane_b32 s21, v18 +; VI-NEXT: v_readfirstlane_b32 s18, v3 +; VI-NEXT: v_readfirstlane_b32 s19, v4 +; VI-NEXT: v_readfirstlane_b32 s16, v5 +; VI-NEXT: v_readfirstlane_b32 s17, v6 +; VI-NEXT: v_readfirstlane_b32 s14, v7 +; VI-NEXT: v_readfirstlane_b32 s15, v8 +; VI-NEXT: v_readfirstlane_b32 s12, v9 +; VI-NEXT: v_readfirstlane_b32 s13, v10 +; VI-NEXT: v_readfirstlane_b32 s10, v11 +; VI-NEXT: v_readfirstlane_b32 s11, v12 +; VI-NEXT: v_readfirstlane_b32 s8, v13 +; VI-NEXT: v_readfirstlane_b32 s9, v14 +; VI-NEXT: v_readfirstlane_b32 s6, v15 +; VI-NEXT: v_readfirstlane_b32 s7, v16 ; VI-NEXT: v_readfirstlane_b32 s4, v1 -; VI-NEXT: s_and_b64 s[46:47], vcc, exec +; VI-NEXT: s_and_b64 s[26:27], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s5, v2 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill @@ -194812,471 +193324,497 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane ; VI-NEXT: s_cbranch_scc0 .LBB95_3 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s46, s5, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 23 -; VI-NEXT: s_lshr_b32 s46, s5, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 15 -; VI-NEXT: s_lshr_b32 s46, s5, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 33 -; VI-NEXT: s_lshr_b32 s46, s4, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 14 -; VI-NEXT: s_lshr_b32 s46, s4, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 34 -; VI-NEXT: s_lshr_b32 s46, s29, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 22 -; VI-NEXT: s_lshr_b32 s46, s29, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 13 -; VI-NEXT: s_lshr_b32 s46, s29, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 32 -; VI-NEXT: s_lshr_b32 s46, s28, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 12 -; VI-NEXT: s_lshr_b32 s46, s28, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 35 -; VI-NEXT: s_lshr_b32 s46, s27, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 21 -; VI-NEXT: s_lshr_b32 s46, s27, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 11 -; VI-NEXT: s_lshr_b32 s46, s27, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 31 -; VI-NEXT: s_lshr_b32 s46, s26, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 10 -; VI-NEXT: s_lshr_b32 s46, s26, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 36 -; VI-NEXT: s_lshr_b32 s46, s25, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 20 -; VI-NEXT: s_lshr_b32 s46, s25, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 9 -; VI-NEXT: s_lshr_b32 s46, s25, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 30 -; VI-NEXT: s_lshr_b32 s46, s24, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 8 -; VI-NEXT: s_lshr_b32 s46, s24, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 37 -; VI-NEXT: s_lshr_b32 s46, s23, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 19 -; VI-NEXT: s_lshr_b32 s46, s23, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 7 -; VI-NEXT: s_lshr_b32 s46, s23, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 29 -; VI-NEXT: s_lshr_b32 s46, s22, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 6 -; VI-NEXT: s_lshr_b32 s46, s22, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 38 -; VI-NEXT: s_lshr_b32 s46, s21, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 18 -; VI-NEXT: s_lshr_b32 s46, s21, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 5 -; VI-NEXT: s_lshr_b32 s46, s21, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 28 -; VI-NEXT: s_lshr_b32 s46, s20, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 4 -; VI-NEXT: s_lshr_b32 s46, s20, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 39 -; VI-NEXT: s_lshr_b32 s46, s19, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 17 -; VI-NEXT: s_lshr_b32 s46, s19, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 3 -; VI-NEXT: s_lshr_b32 s46, s19, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 27 -; VI-NEXT: s_lshr_b32 s46, s18, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 2 -; VI-NEXT: s_lshr_b32 s46, s18, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 40 -; VI-NEXT: s_lshr_b32 s46, s17, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 16 -; VI-NEXT: s_lshr_b32 s46, s17, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 1 -; VI-NEXT: s_lshr_b32 s46, s17, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 26 -; VI-NEXT: s_lshr_b32 s46, s16, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 0 -; VI-NEXT: s_lshr_b32 s46, s16, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 41 -; VI-NEXT: s_lshr_b32 s46, s7, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 24 -; VI-NEXT: s_lshr_b32 s46, s7, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 43 -; VI-NEXT: s_lshr_b32 s46, s6, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 42 -; VI-NEXT: s_lshr_b32 s46, s9, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 25 -; VI-NEXT: s_lshr_b32 s46, s9, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 45 -; VI-NEXT: s_lshr_b32 s46, s8, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 44 -; VI-NEXT: s_lshr_b32 s46, s11, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 47 -; VI-NEXT: s_lshr_b32 s46, s10, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 46 -; VI-NEXT: s_lshr_b32 s46, s13, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 49 -; VI-NEXT: s_lshr_b32 s46, s12, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 48 -; VI-NEXT: s_lshr_b32 s46, s15, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 51 -; VI-NEXT: s_lshr_b32 s46, s14, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 50 -; VI-NEXT: s_lshr_b32 s46, s41, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 53 -; VI-NEXT: s_lshr_b32 s46, s40, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 52 -; VI-NEXT: s_lshr_b32 s46, s43, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 55 -; VI-NEXT: s_lshr_b32 s46, s42, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 54 -; VI-NEXT: s_lshr_b32 s46, s45, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 57 -; VI-NEXT: s_lshr_b32 s46, s44, 8 -; VI-NEXT: s_lshr_b32 s52, s7, 16 -; VI-NEXT: s_lshr_b32 s53, s6, 16 -; VI-NEXT: s_lshr_b32 s84, s9, 16 -; VI-NEXT: s_lshr_b32 s85, s8, 16 -; VI-NEXT: s_lshr_b32 s80, s11, 24 -; VI-NEXT: s_lshr_b32 s86, s11, 16 -; VI-NEXT: s_lshr_b32 s87, s10, 16 -; VI-NEXT: s_lshr_b32 s81, s13, 24 -; VI-NEXT: s_lshr_b32 s54, s13, 16 -; VI-NEXT: s_lshr_b32 s55, s12, 16 -; VI-NEXT: s_lshr_b32 s82, s15, 24 -; VI-NEXT: s_lshr_b32 s64, s15, 16 -; VI-NEXT: s_lshr_b32 s65, s14, 16 -; VI-NEXT: s_lshr_b32 s83, s41, 24 -; VI-NEXT: s_lshr_b32 s66, s41, 16 -; VI-NEXT: s_lshr_b32 s67, s40, 16 -; VI-NEXT: s_lshr_b32 s50, s43, 24 -; VI-NEXT: s_lshr_b32 s68, s43, 16 -; VI-NEXT: s_lshr_b32 s69, s42, 16 -; VI-NEXT: s_lshr_b32 s51, s45, 24 -; VI-NEXT: s_lshr_b32 s70, s45, 16 -; VI-NEXT: s_lshr_b32 s71, s44, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 56 -; VI-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 -; VI-NEXT: s_lshr_b64 s[56:57], s[28:29], 24 -; VI-NEXT: s_lshr_b64 s[58:59], s[26:27], 24 -; VI-NEXT: s_lshr_b64 s[60:61], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[72:73], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[74:75], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[76:77], s[16:17], 24 -; VI-NEXT: s_lshr_b64 s[78:79], s[6:7], 24 -; VI-NEXT: s_lshr_b64 s[88:89], s[8:9], 24 -; VI-NEXT: s_lshr_b64 s[90:91], s[10:11], 24 -; VI-NEXT: s_lshr_b64 s[30:31], s[12:13], 24 -; VI-NEXT: s_lshr_b64 s[34:35], s[14:15], 24 -; VI-NEXT: s_lshr_b64 s[36:37], s[40:41], 24 -; VI-NEXT: s_lshr_b64 s[38:39], s[42:43], 24 -; VI-NEXT: s_lshr_b64 s[48:49], s[44:45], 24 +; VI-NEXT: s_lshr_b32 s26, s5, 24 +; VI-NEXT: v_writelane_b32 v62, s26, 26 +; VI-NEXT: s_lshr_b32 s26, s5, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 15 +; VI-NEXT: s_lshr_b32 s26, s5, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 57 +; VI-NEXT: s_lshr_b32 s26, s4, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 14 +; VI-NEXT: s_lshr_b32 s26, s4, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 56 +; VI-NEXT: s_lshr_b32 s26, s7, 24 +; VI-NEXT: v_writelane_b32 v62, s26, 25 +; VI-NEXT: s_lshr_b32 s26, s7, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 13 +; VI-NEXT: s_lshr_b32 s26, s7, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 55 +; VI-NEXT: s_lshr_b32 s26, s6, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 12 +; VI-NEXT: s_lshr_b32 s26, s6, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 54 +; VI-NEXT: s_lshr_b32 s26, s9, 24 +; VI-NEXT: v_writelane_b32 v62, s26, 24 +; VI-NEXT: s_lshr_b32 s26, s9, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 11 +; VI-NEXT: s_lshr_b32 s26, s9, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 53 +; VI-NEXT: s_lshr_b32 s26, s8, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 10 +; VI-NEXT: s_lshr_b32 s26, s8, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 52 +; VI-NEXT: s_lshr_b32 s26, s11, 24 +; VI-NEXT: v_writelane_b32 v62, s26, 23 +; VI-NEXT: s_lshr_b32 s26, s11, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 9 +; VI-NEXT: s_lshr_b32 s26, s11, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 51 +; VI-NEXT: s_lshr_b32 s26, s10, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 8 +; VI-NEXT: s_lshr_b32 s26, s10, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 50 +; VI-NEXT: s_lshr_b32 s26, s13, 24 +; VI-NEXT: v_writelane_b32 v62, s26, 22 +; VI-NEXT: s_lshr_b32 s26, s13, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 7 +; VI-NEXT: s_lshr_b32 s26, s13, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 49 +; VI-NEXT: s_lshr_b32 s26, s12, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 6 +; VI-NEXT: s_lshr_b32 s26, s12, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 48 +; VI-NEXT: s_lshr_b32 s26, s15, 24 +; VI-NEXT: v_writelane_b32 v62, s26, 21 +; VI-NEXT: s_lshr_b32 s26, s15, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 5 +; VI-NEXT: s_lshr_b32 s26, s15, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 47 +; VI-NEXT: s_lshr_b32 s26, s14, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 4 +; VI-NEXT: s_lshr_b32 s26, s14, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 46 +; VI-NEXT: s_lshr_b32 s26, s17, 24 +; VI-NEXT: v_writelane_b32 v62, s26, 20 +; VI-NEXT: s_lshr_b32 s26, s17, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 3 +; VI-NEXT: s_lshr_b32 s26, s17, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 45 +; VI-NEXT: s_lshr_b32 s26, s16, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 2 +; VI-NEXT: s_lshr_b32 s26, s16, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 44 +; VI-NEXT: s_lshr_b32 s26, s19, 24 +; VI-NEXT: v_writelane_b32 v62, s26, 19 +; VI-NEXT: s_lshr_b32 s26, s19, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 1 +; VI-NEXT: s_lshr_b32 s26, s19, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 43 +; VI-NEXT: s_lshr_b32 s26, s18, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 0 +; VI-NEXT: s_lshr_b32 s26, s18, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 42 +; VI-NEXT: s_lshr_b32 s26, s21, 24 +; VI-NEXT: v_writelane_b32 v62, s26, 18 +; VI-NEXT: s_lshr_b32 s26, s21, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 41 +; VI-NEXT: s_lshr_b32 s26, s20, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 40 +; VI-NEXT: s_lshr_b32 s26, s23, 24 +; VI-NEXT: v_writelane_b32 v62, s26, 17 +; VI-NEXT: s_lshr_b32 s26, s23, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 39 +; VI-NEXT: s_lshr_b32 s26, s22, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 38 +; VI-NEXT: s_lshr_b32 s26, s25, 24 +; VI-NEXT: v_writelane_b32 v62, s26, 16 +; VI-NEXT: s_lshr_b32 s26, s25, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 37 +; VI-NEXT: s_lshr_b32 s26, s24, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 36 +; VI-NEXT: s_lshr_b32 s26, s41, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 35 +; VI-NEXT: s_lshr_b32 s26, s40, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 34 +; VI-NEXT: s_lshr_b32 s26, s43, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 33 +; VI-NEXT: s_lshr_b32 s26, s42, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 32 +; VI-NEXT: s_lshr_b32 s26, s45, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 31 +; VI-NEXT: s_lshr_b32 s26, s44, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 30 +; VI-NEXT: s_lshr_b32 s26, s47, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 29 +; VI-NEXT: s_lshr_b32 s26, s46, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 28 +; VI-NEXT: s_lshr_b32 s26, s57, 8 +; VI-NEXT: s_lshr_b32 s86, s21, 16 +; VI-NEXT: s_lshr_b32 s87, s20, 16 +; VI-NEXT: s_lshr_b32 s50, s23, 16 +; VI-NEXT: s_lshr_b32 s51, s22, 16 +; VI-NEXT: s_lshr_b32 s52, s25, 16 +; VI-NEXT: s_lshr_b32 s53, s24, 16 +; VI-NEXT: s_lshr_b32 s81, s41, 24 +; VI-NEXT: s_lshr_b32 s54, s41, 16 +; VI-NEXT: s_lshr_b32 s55, s40, 16 +; VI-NEXT: s_lshr_b32 s82, s43, 24 +; VI-NEXT: s_lshr_b32 s64, s43, 16 +; VI-NEXT: s_lshr_b32 s65, s42, 16 +; VI-NEXT: s_lshr_b32 s83, s45, 24 +; VI-NEXT: s_lshr_b32 s66, s45, 16 +; VI-NEXT: s_lshr_b32 s67, s44, 16 +; VI-NEXT: s_lshr_b32 s84, s47, 24 +; VI-NEXT: s_lshr_b32 s68, s47, 16 +; VI-NEXT: s_lshr_b32 s69, s46, 16 +; VI-NEXT: s_lshr_b32 s85, s57, 24 +; VI-NEXT: s_lshr_b32 s70, s57, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 27 +; VI-NEXT: s_lshr_b32 s71, s56, 16 +; VI-NEXT: s_lshr_b32 s80, s56, 8 +; VI-NEXT: s_lshr_b64 s[26:27], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[28:29], s[6:7], 24 +; VI-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 +; VI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 +; VI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[74:75], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[76:77], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[88:89], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[90:91], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[30:31], s[40:41], 24 +; VI-NEXT: s_lshr_b64 s[34:35], s[42:43], 24 +; VI-NEXT: s_lshr_b64 s[36:37], s[44:45], 24 +; VI-NEXT: s_lshr_b64 s[38:39], s[46:47], 24 +; VI-NEXT: s_lshr_b64 s[48:49], s[56:57], 24 ; VI-NEXT: s_cbranch_execnz .LBB95_4 ; VI-NEXT: .LBB95_2: ; %cmp.true -; VI-NEXT: s_lshr_b32 s46, s45, 16 -; VI-NEXT: v_mov_b32_e32 v7, 0x200 -; VI-NEXT: v_add_f16_e32 v11, s46, v7 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v11 -; VI-NEXT: v_add_f16_e32 v2, s45, v7 -; VI-NEXT: s_lshr_b32 s45, s44, 16 +; VI-NEXT: s_lshr_b32 s26, s57, 16 +; VI-NEXT: v_mov_b32_e32 v9, 0x200 +; VI-NEXT: v_add_f16_e32 v0, s26, v9 +; VI-NEXT: s_lshr_b32 s26, s56, 16 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_add_f16_e32 v2, s57, v9 +; VI-NEXT: v_add_f16_e32 v0, s26, v9 +; VI-NEXT: s_lshr_b32 s26, s47, 16 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v23, v2, v1 -; VI-NEXT: v_add_f16_e32 v40, s45, v7 -; VI-NEXT: v_add_f16_e32 v2, s44, v7 -; VI-NEXT: s_lshr_b32 s44, s43, 16 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 -; VI-NEXT: v_add_f16_e32 v43, s44, v7 +; VI-NEXT: v_or_b32_e32 v35, v2, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_add_f16_e32 v2, s56, v9 +; VI-NEXT: v_add_f16_e32 v0, s26, v9 +; VI-NEXT: s_lshr_b32 s26, s46, 16 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v22, v2, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 -; VI-NEXT: v_add_f16_e32 v2, s43, v7 -; VI-NEXT: s_lshr_b32 s43, s42, 16 +; VI-NEXT: v_or_b32_e32 v34, v2, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_add_f16_e32 v2, s47, v9 +; VI-NEXT: v_add_f16_e32 v0, s26, v9 +; VI-NEXT: s_lshr_b32 s26, s45, 16 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v25, v2, v1 -; VI-NEXT: v_add_f16_e32 v54, s43, v7 -; VI-NEXT: v_add_f16_e32 v2, s42, v7 -; VI-NEXT: s_lshr_b32 s42, s41, 16 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 -; VI-NEXT: v_add_f16_e32 v37, s42, v7 +; VI-NEXT: v_or_b32_e32 v23, v2, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_add_f16_e32 v2, s46, v9 +; VI-NEXT: v_add_f16_e32 v0, s26, v9 +; VI-NEXT: s_lshr_b32 s26, s44, 16 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v24, v2, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 -; VI-NEXT: v_add_f16_e32 v2, s41, v7 -; VI-NEXT: s_lshr_b32 s41, s40, 16 +; VI-NEXT: v_or_b32_e32 v22, v2, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_add_f16_e32 v2, s45, v9 +; VI-NEXT: v_add_f16_e32 v0, s26, v9 +; VI-NEXT: s_lshr_b32 s26, s43, 16 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v27, v2, v1 -; VI-NEXT: v_add_f16_e32 v49, s41, v7 -; VI-NEXT: v_add_f16_e32 v2, s40, v7 -; VI-NEXT: s_lshr_b32 s40, s15, 16 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; VI-NEXT: v_add_f16_e32 v52, s40, v7 +; VI-NEXT: v_or_b32_e32 v21, v2, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_add_f16_e32 v2, s44, v9 +; VI-NEXT: v_add_f16_e32 v58, s26, v9 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v26, v2, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 -; VI-NEXT: v_add_f16_e32 v2, s15, v7 -; VI-NEXT: s_lshr_b32 s15, s14, 16 +; VI-NEXT: v_or_b32_e32 v20, v2, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v58 +; VI-NEXT: v_add_f16_e32 v2, s43, v9 +; VI-NEXT: s_lshr_b32 s26, s42, 16 +; VI-NEXT: v_or_b32_e32 v25, v2, v1 +; VI-NEXT: v_add_f16_e32 v1, s26, v9 +; VI-NEXT: s_lshr_b32 s26, s41, 16 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v29, v2, v1 -; VI-NEXT: v_add_f16_e32 v55, s15, v7 -; VI-NEXT: v_add_f16_e32 v2, s14, v7 -; VI-NEXT: s_lshr_b32 s14, s13, 16 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 -; VI-NEXT: v_add_f16_e32 v53, s14, v7 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_f16_e32 v2, s42, v9 +; VI-NEXT: v_add_f16_e32 v46, s26, v9 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v28, v2, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 -; VI-NEXT: v_add_f16_e32 v2, s13, v7 -; VI-NEXT: s_lshr_b32 s13, s12, 16 +; VI-NEXT: v_or_b32_e32 v24, v2, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; VI-NEXT: v_add_f16_e32 v2, s41, v9 +; VI-NEXT: s_lshr_b32 s26, s40, 16 +; VI-NEXT: v_or_b32_e32 v7, v2, v1 +; VI-NEXT: v_add_f16_e32 v1, s26, v9 +; VI-NEXT: s_lshr_b32 s26, s25, 16 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v6, v2, v1 -; VI-NEXT: v_add_f16_e32 v1, s13, v7 -; VI-NEXT: v_add_f16_e32 v2, s12, v7 -; VI-NEXT: s_lshr_b32 s12, s11, 16 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_add_f16_e32 v39, s12, v7 +; VI-NEXT: v_add_f16_e32 v2, s40, v9 +; VI-NEXT: v_add_f16_e32 v47, s26, v9 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v5, v2, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 -; VI-NEXT: v_add_f16_e32 v2, s11, v7 -; VI-NEXT: s_lshr_b32 s11, s10, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; VI-NEXT: v_add_f16_e32 v2, s25, v9 +; VI-NEXT: s_lshr_b32 s25, s24, 16 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v31, v2, v1 -; VI-NEXT: v_add_f16_e32 v60, s11, v7 -; VI-NEXT: v_add_f16_e32 v2, s10, v7 -; VI-NEXT: s_lshr_b32 s10, s9, 16 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v60 -; VI-NEXT: v_add_f16_e32 v48, s10, v7 +; VI-NEXT: v_or_b32_e32 v27, v2, v1 +; VI-NEXT: v_add_f16_e32 v1, s25, v9 +; VI-NEXT: v_add_f16_e32 v2, s24, v9 +; VI-NEXT: s_lshr_b32 s24, s23, 16 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_f16_e32 v44, s24, v9 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v30, v2, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 -; VI-NEXT: v_add_f16_e32 v2, s9, v7 -; VI-NEXT: s_lshr_b32 s9, s8, 16 +; VI-NEXT: v_or_b32_e32 v26, v2, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; VI-NEXT: v_add_f16_e32 v2, s23, v9 +; VI-NEXT: s_lshr_b32 s23, s22, 16 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v4, v2, v1 -; VI-NEXT: v_add_f16_e32 v1, s9, v7 -; VI-NEXT: v_add_f16_e32 v2, s8, v7 -; VI-NEXT: s_lshr_b32 s8, s7, 16 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_add_f16_e32 v1, s23, v9 +; VI-NEXT: v_add_f16_e32 v2, s22, v9 +; VI-NEXT: s_lshr_b32 s22, s21, 16 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_add_f16_e32 v50, s8, v7 +; VI-NEXT: v_add_f16_e32 v60, s22, v9 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v3, v2, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 -; VI-NEXT: v_add_f16_e32 v2, s7, v7 -; VI-NEXT: s_lshr_b32 s7, s6, 16 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v60 +; VI-NEXT: v_add_f16_e32 v2, s21, v9 +; VI-NEXT: s_lshr_b32 s21, s20, 16 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v2, v2, v1 -; VI-NEXT: v_add_f16_e32 v1, s7, v7 -; VI-NEXT: v_add_f16_e32 v8, s6, v7 -; VI-NEXT: s_lshr_b32 s6, s17, 16 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_add_f16_e32 v1, s21, v9 +; VI-NEXT: v_add_f16_e32 v10, s20, v9 +; VI-NEXT: s_lshr_b32 s20, s19, 16 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_add_f16_e32 v36, s6, v7 -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v1, v8, v1 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v36 -; VI-NEXT: v_add_f16_e32 v9, s17, v7 -; VI-NEXT: s_lshr_b32 s6, s16, 16 -; VI-NEXT: v_or_b32_e32 v33, v9, v8 -; VI-NEXT: v_add_f16_e32 v8, s6, v7 -; VI-NEXT: s_lshr_b32 s6, s19, 16 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_add_f16_e32 v9, s16, v7 -; VI-NEXT: v_add_f16_e32 v38, s6, v7 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v32, v9, v8 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v38 -; VI-NEXT: v_add_f16_e32 v9, s19, v7 -; VI-NEXT: s_lshr_b32 s6, s18, 16 -; VI-NEXT: v_or_b32_e32 v21, v9, v8 -; VI-NEXT: v_add_f16_e32 v8, s6, v7 -; VI-NEXT: s_lshr_b32 s6, s21, 16 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_add_f16_e32 v9, s18, v7 -; VI-NEXT: v_add_f16_e32 v61, s6, v7 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v20, v9, v8 -; VI-NEXT: s_lshr_b32 s7, s20, 16 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v61 -; VI-NEXT: v_add_f16_e32 v9, s21, v7 -; VI-NEXT: v_or_b32_e32 v35, v9, v8 -; VI-NEXT: v_add_f16_e32 v8, s7, v7 -; VI-NEXT: s_lshr_b32 s6, s23, 16 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_add_f16_e32 v9, s20, v7 -; VI-NEXT: v_add_f16_e32 v45, s6, v7 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v34, v9, v8 -; VI-NEXT: s_lshr_b32 s7, s22, 16 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v45 -; VI-NEXT: v_add_f16_e32 v9, s23, v7 -; VI-NEXT: v_or_b32_e32 v19, v9, v8 -; VI-NEXT: v_add_f16_e32 v8, s7, v7 -; VI-NEXT: s_lshr_b32 s6, s25, 16 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_add_f16_e32 v9, s22, v7 -; VI-NEXT: v_add_f16_e32 v47, s6, v7 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v18, v9, v8 -; VI-NEXT: s_lshr_b32 s7, s24, 16 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v47 -; VI-NEXT: v_add_f16_e32 v9, s25, v7 -; VI-NEXT: v_or_b32_e32 v16, v9, v8 -; VI-NEXT: v_add_f16_e32 v8, s7, v7 -; VI-NEXT: s_lshr_b32 s6, s27, 16 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_add_f16_e32 v9, s24, v7 -; VI-NEXT: v_add_f16_e32 v57, s6, v7 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v15, v9, v8 -; VI-NEXT: s_lshr_b32 s7, s26, 16 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v57 -; VI-NEXT: v_add_f16_e32 v9, s27, v7 -; VI-NEXT: v_or_b32_e32 v13, v9, v8 -; VI-NEXT: v_add_f16_e32 v8, s7, v7 -; VI-NEXT: s_lshr_b32 s6, s29, 16 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_add_f16_e32 v9, s26, v7 -; VI-NEXT: v_add_f16_e32 v59, s6, v7 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v12, v9, v8 -; VI-NEXT: s_lshr_b32 s7, s28, 16 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v59 -; VI-NEXT: v_add_f16_e32 v9, s29, v7 -; VI-NEXT: v_or_b32_e32 v10, v9, v8 -; VI-NEXT: v_add_f16_e32 v8, s7, v7 +; VI-NEXT: v_add_f16_e32 v43, s20, v9 +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v1, v10, v1 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v43 +; VI-NEXT: v_add_f16_e32 v51, s19, v9 +; VI-NEXT: s_lshr_b32 s19, s18, 16 +; VI-NEXT: v_or_b32_e32 v29, v51, v10 +; VI-NEXT: v_add_f16_e32 v10, s19, v9 +; VI-NEXT: v_add_f16_e32 v54, s18, v9 +; VI-NEXT: s_lshr_b32 s18, s17, 16 +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; VI-NEXT: v_add_f16_e32 v45, s18, v9 +; VI-NEXT: v_add_f16_e32 v11, s17, v9 +; VI-NEXT: s_lshr_b32 s17, s16, 16 +; VI-NEXT: v_or_b32_e32 v28, v54, v10 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v45 +; VI-NEXT: v_add_f16_e32 v36, s17, v9 +; VI-NEXT: v_add_f16_e32 v55, s16, v9 +; VI-NEXT: s_lshr_b32 s16, s15, 16 +; VI-NEXT: v_or_b32_e32 v19, v11, v10 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v36 +; VI-NEXT: v_add_f16_e32 v52, s16, v9 +; VI-NEXT: v_or_b32_e32 v18, v55, v10 +; VI-NEXT: s_lshr_b32 s17, s14, 16 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v52 +; VI-NEXT: v_add_f16_e32 v13, s15, v9 +; VI-NEXT: v_or_b32_e32 v31, v13, v10 +; VI-NEXT: v_add_f16_e32 v10, s17, v9 +; VI-NEXT: v_add_f16_e32 v59, s14, v9 +; VI-NEXT: s_lshr_b32 s14, s13, 16 +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; VI-NEXT: s_lshr_b32 s15, s12, 16 +; VI-NEXT: v_add_f16_e32 v50, s14, v9 +; VI-NEXT: v_or_b32_e32 v30, v59, v10 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v50 +; VI-NEXT: v_add_f16_e32 v53, s13, v9 +; VI-NEXT: v_add_f16_e32 v8, s15, v9 +; VI-NEXT: v_add_f16_e32 v41, s12, v9 +; VI-NEXT: s_lshr_b32 s12, s11, 16 +; VI-NEXT: v_or_b32_e32 v17, v53, v10 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v8 +; VI-NEXT: s_lshr_b32 s13, s10, 16 +; VI-NEXT: v_add_f16_e32 v5, s12, v9 +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v16, v41, v10 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; VI-NEXT: v_add_f16_e32 v57, s11, v9 +; VI-NEXT: v_add_f16_e32 v38, s13, v9 +; VI-NEXT: v_add_f16_e32 v11, s10, v9 +; VI-NEXT: s_lshr_b32 s10, s9, 16 +; VI-NEXT: v_or_b32_e32 v33, v57, v10 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v38 +; VI-NEXT: s_lshr_b32 s11, s8, 16 +; VI-NEXT: v_add_f16_e32 v37, s10, v9 +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v32, v11, v10 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v37 +; VI-NEXT: v_add_f16_e32 v61, s9, v9 +; VI-NEXT: v_add_f16_e32 v48, s11, v9 +; VI-NEXT: v_add_f16_e32 v11, s8, v9 +; VI-NEXT: s_lshr_b32 s8, s7, 16 +; VI-NEXT: v_or_b32_e32 v15, v61, v10 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v48 +; VI-NEXT: v_add_f16_e32 v49, s8, v9 +; VI-NEXT: v_or_b32_e32 v14, v11, v10 +; VI-NEXT: s_lshr_b32 s9, s6, 16 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v49 +; VI-NEXT: v_add_f16_e32 v56, s7, v9 +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v12, v56, v10 +; VI-NEXT: v_add_f16_e32 v10, s9, v9 +; VI-NEXT: v_add_f16_e32 v11, s6, v9 ; VI-NEXT: s_lshr_b32 s6, s5, 16 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_add_f16_e32 v9, s28, v7 +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; VI-NEXT: s_lshr_b32 s7, s4, 16 -; VI-NEXT: v_add_f16_e32 v51, s6, v7 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v9, v9, v8 -; VI-NEXT: v_add_f16_e32 v8, s5, v7 -; VI-NEXT: v_add_f16_e32 v14, s7, v7 -; VI-NEXT: v_add_f16_e32 v17, s4, v7 -; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v51 -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v8, v8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v7, v17, v7 -; VI-NEXT: v_lshrrev_b32_e32 v14, 8, v8 -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v14, 8, v7 -; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8] -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v10 -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v9 -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v13 -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v12 -; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[12:13] -; VI-NEXT: v_lshrrev_b32_e32 v12, 8, v2 -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v12, 8, v1 -; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[1:2] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v4 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v3 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[3:4] -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[9:10], 24, v[9:10] -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v16 -; VI-NEXT: v_lshrrev_b64 v[16:17], 24, v[15:16] +; VI-NEXT: v_add_f16_e32 v40, s6, v9 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v11, v11, v10 +; VI-NEXT: v_add_f16_e32 v10, s5, v9 +; VI-NEXT: v_add_f16_e32 v0, s7, v9 +; VI-NEXT: v_add_f16_e32 v42, s4, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v40 +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v10, v10, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v9, v42, v9 +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v10 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v9 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v12 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v11 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v15 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v14 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v33 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v32 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v17 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[1:2] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[3:4] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v27 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v26 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[26:27] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v19 -; VI-NEXT: v_lshrrev_b32_e32 v10, 8, v18 +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v7 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v6 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v25 +; VI-NEXT: v_lshrrev_b64 v[9:10], 24, v[9:10] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v24 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[24:25] +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[9:10], 24, v[11:12] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[14:15] +; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[32:33] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v12, 8, v16 +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[16:17] +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v21 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[16:17], 24, v[30:31] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v20 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v18 ; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[18:19] -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v31 -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v15 -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v35 -; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[34:35] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v30 -; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v23 -; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[22:23] -; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v23, v50, 8, 8 -; VI-NEXT: v_mov_b32_e32 v50, v11 -; VI-NEXT: v_bfe_u32 v11, v48, 8, 8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[30:31] -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v6 -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v11, v39, 8, 8 -; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v20 -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[20:21] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v5 -; VI-NEXT: v_lshrrev_b64 v[4:5], 24, v[5:6] -; VI-NEXT: v_lshrrev_b64 v[30:31], 24, v[26:27] -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v11, v53, 8, 8 -; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v34 -; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v21 -; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[32:33] -; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[28:29] -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v25 -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v24 -; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[24:25] -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v11, v52, 8, 8 -; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v33 -; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v32 -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v29 -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 -; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v27 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v26 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v22 -; VI-NEXT: v_bfe_u32 v25, v51, 8, 8 -; VI-NEXT: v_bfe_u32 v27, v59, 8, 8 -; VI-NEXT: v_bfe_u32 v6, v57, 8, 8 -; VI-NEXT: v_bfe_u32 v12, v47, 8, 8 -; VI-NEXT: v_bfe_u32 v15, v45, 8, 8 -; VI-NEXT: v_bfe_u32 v1, v61, 8, 8 -; VI-NEXT: v_bfe_u32 v22, v38, 8, 8 -; VI-NEXT: v_bfe_u32 v2, v36, 8, 8 -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v44, v37, 8, 8 -; VI-NEXT: v_bfe_u32 v11, v43, 8, 8 -; VI-NEXT: v_bfe_u32 v26, v50, 8, 8 -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v23 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v19 +; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[28:29] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v22 +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[6:7] +; VI-NEXT: v_lshrrev_b64 v[6:7], 24, v[20:21] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[22:23] +; VI-NEXT: v_bfe_u32 v0, v60, 8, 8 +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v20, v40, 8, 8 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v42, v40 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v35 +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[34:35] +; VI-NEXT: v_mov_b32_e32 v4, v5 +; VI-NEXT: v_bfe_u32 v27, v5, 8, 8 +; VI-NEXT: v_mov_b32_e32 v5, v50 +; VI-NEXT: v_bfe_u32 v1, v50, 8, 8 +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v25, v52 +; VI-NEXT: v_bfe_u32 v23, v52, 8, 8 +; VI-NEXT: v_mov_b32_e32 v52, v36 +; VI-NEXT: v_mov_b32_e32 v36, v44 +; VI-NEXT: v_lshrrev_b32_e32 v26, 8, v34 +; VI-NEXT: v_bfe_u32 v34, v36, 8, 8 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v34, v47, 8, 8 +; VI-NEXT: v_mov_b32_e32 v44, v58 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v34, v46, 8, 8 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v34, v44, 8, 8 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v31 +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v29 +; VI-NEXT: v_lshrrev_b32_e32 v10, 8, v28 +; VI-NEXT: v_mov_b32_e32 v2, v54 +; VI-NEXT: v_bfe_u32 v21, v49, 8, 8 +; VI-NEXT: v_bfe_u32 v54, v37, 8, 8 +; VI-NEXT: v_bfe_u32 v28, v45, 8, 8 +; VI-NEXT: v_bfe_u32 v29, v43, 8, 8 +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_bfe_u32 v34, v0, 8, 8 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_bfe_u32 v34, v40, 8, 8 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_bfe_u32 v34, v60, 8, 8 ; VI-NEXT: s_branch .LBB95_5 ; VI-NEXT: .LBB95_3: -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr80 ; VI-NEXT: ; implicit-def: $sgpr71 ; VI-NEXT: ; implicit-def: $sgpr70 -; VI-NEXT: ; implicit-def: $sgpr51 +; VI-NEXT: ; implicit-def: $sgpr85 ; VI-NEXT: ; implicit-def: $sgpr69 ; VI-NEXT: ; implicit-def: $sgpr68 -; VI-NEXT: ; implicit-def: $sgpr50 +; VI-NEXT: ; implicit-def: $sgpr84 ; VI-NEXT: ; implicit-def: $sgpr67 ; VI-NEXT: ; implicit-def: $sgpr66 ; VI-NEXT: ; implicit-def: $sgpr83 @@ -195286,20 +193824,19 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: ; implicit-def: $sgpr55 ; VI-NEXT: ; implicit-def: $sgpr54 ; VI-NEXT: ; implicit-def: $sgpr81 -; VI-NEXT: ; implicit-def: $sgpr87 -; VI-NEXT: ; implicit-def: $sgpr86 -; VI-NEXT: ; implicit-def: $sgpr80 -; VI-NEXT: ; implicit-def: $sgpr85 -; VI-NEXT: ; implicit-def: $sgpr84 ; VI-NEXT: ; implicit-def: $sgpr53 ; VI-NEXT: ; implicit-def: $sgpr52 +; VI-NEXT: ; implicit-def: $sgpr51 +; VI-NEXT: ; implicit-def: $sgpr50 +; VI-NEXT: ; implicit-def: $sgpr87 +; VI-NEXT: ; implicit-def: $sgpr86 ; VI-NEXT: ; implicit-def: $sgpr76 ; VI-NEXT: ; implicit-def: $sgpr74 ; VI-NEXT: ; implicit-def: $sgpr72 ; VI-NEXT: ; implicit-def: $sgpr62 ; VI-NEXT: ; implicit-def: $sgpr60 ; VI-NEXT: ; implicit-def: $sgpr58 -; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr28 ; VI-NEXT: ; implicit-def: $sgpr48 ; VI-NEXT: ; implicit-def: $sgpr38 ; VI-NEXT: ; implicit-def: $sgpr36 @@ -195308,388 +193845,399 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: ; implicit-def: $sgpr90 ; VI-NEXT: ; implicit-def: $sgpr88 ; VI-NEXT: ; implicit-def: $sgpr78 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: s_branch .LBB95_2 ; VI-NEXT: .LBB95_4: -; VI-NEXT: v_mov_b32_e32 v1, s44 +; VI-NEXT: v_mov_b32_e32 v1, s56 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s45 +; VI-NEXT: v_mov_b32_e32 v1, s57 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_mov_b32_e32 v1, s46 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s43 +; VI-NEXT: v_mov_b32_e32 v1, s47 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s40 +; VI-NEXT: v_mov_b32_e32 v1, s44 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s41 +; VI-NEXT: v_mov_b32_e32 v1, s45 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s14 +; VI-NEXT: v_mov_b32_e32 v1, s42 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s15 +; VI-NEXT: v_mov_b32_e32 v1, s43 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s12 +; VI-NEXT: v_mov_b32_e32 v1, s40 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s13 +; VI-NEXT: v_mov_b32_e32 v1, s41 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s10 +; VI-NEXT: v_mov_b32_e32 v1, s24 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s11 +; VI-NEXT: v_mov_b32_e32 v1, s25 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s8 +; VI-NEXT: v_mov_b32_e32 v1, s22 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v1, s23 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v1, s20 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v1, s21 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s16 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s18 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s19 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s10 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s8 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s20 +; VI-NEXT: v_mov_b32_e32 v1, s65 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s21 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s22 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s23 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s24 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s25 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s26 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s27 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s28 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s29 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s55 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s85 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s53 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s52 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s51 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s87 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s86 ; VI-NEXT: v_readlane_b32 s6, v62, 0 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 1 -; VI-NEXT: v_mov_b32_e32 v36, s6 +; VI-NEXT: v_mov_b32_e32 v43, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 2 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v52, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 3 -; VI-NEXT: v_mov_b32_e32 v38, s6 +; VI-NEXT: v_mov_b32_e32 v45, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 4 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 5 -; VI-NEXT: v_mov_b32_e32 v61, s6 +; VI-NEXT: v_mov_b32_e32 v25, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 6 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v8, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 7 -; VI-NEXT: v_mov_b32_e32 v45, s6 +; VI-NEXT: v_mov_b32_e32 v5, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v38, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 9 -; VI-NEXT: v_mov_b32_e32 v47, s6 +; VI-NEXT: v_mov_b32_e32 v4, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 10 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v48, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 11 -; VI-NEXT: v_mov_b32_e32 v57, s6 +; VI-NEXT: v_mov_b32_e32 v37, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 12 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 13 -; VI-NEXT: v_mov_b32_e32 v59, s6 +; VI-NEXT: v_mov_b32_e32 v49, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 14 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s84 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s83 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s82 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s81 ; VI-NEXT: v_readlane_b32 s4, v62, 16 -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 17 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v22, s4 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 18 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 19 -; VI-NEXT: v_mov_b32_e32 v15, s4 +; VI-NEXT: v_mov_b32_e32 v29, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 20 -; VI-NEXT: v_mov_b32_e32 v12, s4 +; VI-NEXT: v_mov_b32_e32 v28, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 21 -; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: v_mov_b32_e32 v23, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 22 -; VI-NEXT: v_mov_b32_e32 v27, s4 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 23 -; VI-NEXT: v_mov_b32_e32 v25, s4 +; VI-NEXT: v_mov_b32_e32 v27, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 24 -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v54, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 25 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v21, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 26 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s80 -; VI-NEXT: v_mov_b32_e32 v46, s4 +; VI-NEXT: v_mov_b32_e32 v20, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 27 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s81 -; VI-NEXT: v_mov_b32_e32 v41, s4 +; VI-NEXT: v_mov_b32_e32 v22, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 28 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s82 -; VI-NEXT: v_mov_b32_e32 v8, s4 +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 29 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 30 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 31 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 32 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 33 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 34 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 35 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 36 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 37 -; VI-NEXT: v_mov_b32_e32 v7, s4 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 38 -; VI-NEXT: v_mov_b32_e32 v10, s4 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 39 -; VI-NEXT: v_mov_b32_e32 v42, s4 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 40 -; VI-NEXT: v_mov_b32_e32 v56, s4 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 41 -; VI-NEXT: v_mov_b32_e32 v58, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 42 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 42 +; VI-NEXT: v_mov_b32_e32 v10, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 43 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v31, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 44 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v9, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 45 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v30, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 46 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v33, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 47 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v32, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 48 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v12, s4 +; VI-NEXT: v_mov_b32_e32 v11, s90 +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v11, s88 +; VI-NEXT: v_mov_b32_e32 v59, s14 +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v58, s28 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_readlane_b32 s4, v62, 49 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, s4 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s46 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s78 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_readlane_b32 s4, v62, 50 -; VI-NEXT: v_mov_b32_e32 v33, s4 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 51 -; VI-NEXT: v_mov_b32_e32 v32, s4 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 52 -; VI-NEXT: v_mov_b32_e32 v29, s4 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 53 -; VI-NEXT: v_mov_b32_e32 v28, s4 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 54 -; VI-NEXT: v_mov_b32_e32 v34, s4 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 55 -; VI-NEXT: v_mov_b32_e32 v31, s4 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 56 -; VI-NEXT: v_mov_b32_e32 v3, s88 -; VI-NEXT: v_readlane_b32 s6, v62, 15 -; VI-NEXT: v_mov_b32_e32 v21, s4 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 57 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s71 -; VI-NEXT: v_mov_b32_e32 v50, s70 -; VI-NEXT: v_mov_b32_e32 v54, s69 -; VI-NEXT: v_mov_b32_e32 v43, s68 -; VI-NEXT: v_mov_b32_e32 v49, s67 -; VI-NEXT: v_mov_b32_e32 v37, s66 -; VI-NEXT: v_mov_b32_e32 v55, s65 -; VI-NEXT: v_mov_b32_e32 v52, s64 -; VI-NEXT: v_mov_b32_e32 v53, s54 -; VI-NEXT: v_mov_b32_e32 v60, s87 -; VI-NEXT: v_mov_b32_e32 v39, s86 -; VI-NEXT: v_mov_b32_e32 v48, s84 -; VI-NEXT: v_mov_b32_e32 v51, s6 -; VI-NEXT: v_mov_b32_e32 v44, s83 -; VI-NEXT: v_mov_b32_e32 v11, s50 -; VI-NEXT: v_mov_b32_e32 v26, s51 -; VI-NEXT: v_mov_b32_e32 v35, s4 -; VI-NEXT: v_mov_b32_e32 v20, s76 -; VI-NEXT: v_mov_b32_e32 v19, s74 -; VI-NEXT: v_mov_b32_e32 v18, s72 -; VI-NEXT: v_mov_b32_e32 v17, s62 -; VI-NEXT: v_mov_b32_e32 v16, s60 -; VI-NEXT: v_mov_b32_e32 v13, s58 -; VI-NEXT: v_mov_b32_e32 v9, s56 -; VI-NEXT: v_mov_b32_e32 v3, s90 -; VI-NEXT: v_mov_b32_e32 v4, s30 -; VI-NEXT: v_mov_b32_e32 v5, s34 -; VI-NEXT: v_mov_b32_e32 v30, s36 -; VI-NEXT: v_mov_b32_e32 v24, s38 -; VI-NEXT: v_mov_b32_e32 v14, s48 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s34 +; VI-NEXT: v_mov_b32_e32 v11, s78 +; VI-NEXT: v_readlane_b32 s6, v62, 15 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v58, s26 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v51, s19 +; VI-NEXT: v_mov_b32_e32 v55, s16 +; VI-NEXT: v_mov_b32_e32 v13, s15 +; VI-NEXT: v_mov_b32_e32 v41, s12 +; VI-NEXT: v_mov_b32_e32 v53, s13 +; VI-NEXT: v_mov_b32_e32 v57, s11 +; VI-NEXT: v_mov_b32_e32 v61, s9 +; VI-NEXT: v_mov_b32_e32 v56, s7 +; VI-NEXT: v_mov_b32_e32 v7, s71 +; VI-NEXT: v_mov_b32_e32 v60, s70 +; VI-NEXT: v_mov_b32_e32 v50, s69 +; VI-NEXT: v_mov_b32_e32 v40, s68 +; VI-NEXT: v_mov_b32_e32 v35, s67 +; VI-NEXT: v_mov_b32_e32 v0, s66 +; VI-NEXT: v_mov_b32_e32 v44, s64 +; VI-NEXT: v_mov_b32_e32 v46, s54 +; VI-NEXT: v_mov_b32_e32 v47, s52 +; VI-NEXT: v_mov_b32_e32 v36, s50 +; VI-NEXT: v_mov_b32_e32 v42, s6 +; VI-NEXT: v_mov_b32_e32 v34, s85 +; VI-NEXT: v_mov_b32_e32 v26, s80 +; VI-NEXT: v_mov_b32_e32 v24, s48 +; VI-NEXT: v_mov_b32_e32 v19, s38 +; VI-NEXT: v_mov_b32_e32 v6, s36 +; VI-NEXT: v_mov_b32_e32 v3, s30 +; VI-NEXT: v_mov_b32_e32 v18, s76 +; VI-NEXT: v_mov_b32_e32 v17, s74 +; VI-NEXT: v_mov_b32_e32 v16, s72 +; VI-NEXT: v_mov_b32_e32 v15, s62 +; VI-NEXT: v_mov_b32_e32 v14, s60 +; VI-NEXT: v_mov_b32_e32 v11, s58 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: .LBB95_5: ; %end -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v58 -; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v10 +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_or_b32_sdwa v58, v2, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; VI-NEXT: v_or_b32_sdwa v29, v43, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: v_or_b32_sdwa v17, v52, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; VI-NEXT: v_or_b32_sdwa v15, v8, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; VI-NEXT: v_or_b32_sdwa v14, v38, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v34 +; VI-NEXT: v_or_b32_sdwa v8, v60, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_readlane_b32 s87, v63, 31 ; VI-NEXT: v_readlane_b32 s86, v63, 30 ; VI-NEXT: v_readlane_b32 s85, v63, 29 @@ -195723,356 +194271,350 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_readlane_b32 s31, v63, 1 ; VI-NEXT: v_readlane_b32 s30, v63, 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v58, v23, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v20, v23, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v58, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v46 -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v20, v46, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v20, vcc, 4, v0 -; VI-NEXT: buffer_store_dword v2, v20, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v56 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v19, vcc, 8, v0 -; VI-NEXT: buffer_store_dword v2, v19, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v41 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v22 -; VI-NEXT: v_or_b32_sdwa v19, v38, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v19, vcc, 12, v0 -; VI-NEXT: buffer_store_dword v2, v19, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v42 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 16, v0 -; VI-NEXT: buffer_store_dword v2, v18, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v8 -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v10 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v17 +; VI-NEXT: v_or_b32_sdwa v18, v10, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v58, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v18, v39, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v31 +; VI-NEXT: v_or_b32_sdwa v18, v51, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v18, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v29, vcc, 4, v39 +; VI-NEXT: buffer_store_dword v18, v29, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v9 +; VI-NEXT: v_or_b32_sdwa v18, v55, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v18, vcc, 8, v39 +; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v30 +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v28 +; VI-NEXT: v_or_b32_sdwa v18, v45, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v8, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v17, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v18, vcc, 12, v39 +; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v33 +; VI-NEXT: v_or_b32_sdwa v17, v59, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v16, v2, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v17, vcc, 16, v39 +; VI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v32 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v23 +; VI-NEXT: v_or_b32_sdwa v16, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v25, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v17, vcc, 20, v39 +; VI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v12 +; VI-NEXT: v_or_b32_sdwa v16, v41, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v16, vcc, 24, v39 +; VI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v11 +; VI-NEXT: v_or_b32_sdwa v13, v48, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v54 +; VI-NEXT: v_or_b32_sdwa v11, v37, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v2 +; VI-NEXT: v_or_b32_sdwa v15, v53, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v15, vcc, 28, v39 +; VI-NEXT: buffer_store_dword v1, v15, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v15 -; VI-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v16 +; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v14, vcc, 32, v39 +; VI-NEXT: buffer_store_dword v1, v14, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v27 +; VI-NEXT: v_or_b32_sdwa v14, v4, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v14, vcc, 36, v39 +; VI-NEXT: buffer_store_dword v1, v14, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v12 -; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v13, vcc, 40, v39 +; VI-NEXT: buffer_store_dword v1, v13, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v13 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 -; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v11, vcc, 44, v39 +; VI-NEXT: buffer_store_dword v1, v11, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v9 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v4 +; VI-NEXT: v_or_b32_sdwa v11, v2, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v11, vcc, 48, v39 +; VI-NEXT: buffer_store_dword v1, v11, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v21 +; VI-NEXT: v_or_b32_sdwa v11, v49, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v27 -; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v11, vcc, 52, v39 +; VI-NEXT: buffer_store_dword v1, v11, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v9, v2, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v9, vcc, 56, v39 +; VI-NEXT: buffer_store_dword v1, v9, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v20 +; VI-NEXT: v_or_b32_sdwa v9, v42, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v25 -; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v14 -; VI-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 64, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v35 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 -; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v34 +; VI-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v9, vcc, 60, v39 +; VI-NEXT: buffer_store_dword v1, v9, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v26 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v24 +; VI-NEXT: v_or_b32_sdwa v9, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v9, vcc, 64, v39 +; VI-NEXT: buffer_store_dword v1, v9, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v22 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v19 +; VI-NEXT: v_or_b32_sdwa v7, v50, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x44, v39 +; VI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v24 -; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v31 +; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x48, v39 +; VI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v11 -; VI-NEXT: v_or_b32_sdwa v2, v43, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v2 +; VI-NEXT: v_or_b32_sdwa v7, v40, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x4c, v39 +; VI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v29 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v30 -; VI-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 +; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v39 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v28 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v44 -; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v39 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v33 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v5 -; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v39 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v32 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v0 +; VI-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v39 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v39 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v0 +; VI-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v39 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 -; VI-NEXT: v_or_b32_sdwa v2, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v39 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v0 +; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v39 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v39 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v0 +; VI-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v39 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v39 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v0 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v39 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -196089,8 +194631,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -196137,26 +194679,54 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: v_writelane_b32 v63, s96, 32 ; GFX9-NEXT: v_writelane_b32 v63, s97, 33 ; GFX9-NEXT: v_writelane_b32 v63, s98, 34 +; GFX9-NEXT: v_readfirstlane_b32 s56, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, s16 +; GFX9-NEXT: v_readfirstlane_b32 s57, v4 +; GFX9-NEXT: v_mov_b32_e32 v4, s17 +; GFX9-NEXT: v_readfirstlane_b32 s46, v5 +; GFX9-NEXT: v_mov_b32_e32 v5, s18 +; GFX9-NEXT: v_readfirstlane_b32 s47, v6 +; GFX9-NEXT: v_mov_b32_e32 v6, s19 +; GFX9-NEXT: v_readfirstlane_b32 s44, v7 +; GFX9-NEXT: v_mov_b32_e32 v7, s20 +; GFX9-NEXT: v_readfirstlane_b32 s45, v8 +; GFX9-NEXT: v_mov_b32_e32 v8, s21 +; GFX9-NEXT: v_readfirstlane_b32 s42, v9 +; GFX9-NEXT: v_mov_b32_e32 v9, s22 +; GFX9-NEXT: v_readfirstlane_b32 s43, v10 +; GFX9-NEXT: v_mov_b32_e32 v10, s23 +; GFX9-NEXT: v_readfirstlane_b32 s40, v11 +; GFX9-NEXT: v_mov_b32_e32 v11, s24 +; GFX9-NEXT: v_readfirstlane_b32 s41, v12 +; GFX9-NEXT: v_mov_b32_e32 v12, s25 +; GFX9-NEXT: v_readfirstlane_b32 s24, v13 +; GFX9-NEXT: v_mov_b32_e32 v13, s26 +; GFX9-NEXT: v_readfirstlane_b32 s25, v14 +; GFX9-NEXT: v_mov_b32_e32 v14, s27 +; GFX9-NEXT: v_readfirstlane_b32 s22, v15 +; GFX9-NEXT: v_mov_b32_e32 v15, s28 +; GFX9-NEXT: v_readfirstlane_b32 s23, v16 +; GFX9-NEXT: v_mov_b32_e32 v16, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 ; GFX9-NEXT: v_writelane_b32 v63, s99, 35 -; GFX9-NEXT: v_readfirstlane_b32 s44, v3 -; GFX9-NEXT: v_readfirstlane_b32 s45, v4 -; GFX9-NEXT: v_readfirstlane_b32 s42, v5 -; GFX9-NEXT: v_readfirstlane_b32 s43, v6 -; GFX9-NEXT: v_readfirstlane_b32 s40, v7 -; GFX9-NEXT: v_readfirstlane_b32 s41, v8 -; GFX9-NEXT: v_readfirstlane_b32 s14, v9 -; GFX9-NEXT: v_readfirstlane_b32 s15, v10 -; GFX9-NEXT: v_readfirstlane_b32 s12, v11 -; GFX9-NEXT: v_readfirstlane_b32 s13, v12 -; GFX9-NEXT: v_readfirstlane_b32 s10, v13 -; GFX9-NEXT: v_readfirstlane_b32 s11, v14 -; GFX9-NEXT: v_readfirstlane_b32 s8, v15 -; GFX9-NEXT: v_readfirstlane_b32 s9, v16 -; GFX9-NEXT: v_readfirstlane_b32 s6, v17 -; GFX9-NEXT: v_readfirstlane_b32 s7, v18 +; GFX9-NEXT: v_readfirstlane_b32 s20, v17 +; GFX9-NEXT: v_readfirstlane_b32 s21, v18 +; GFX9-NEXT: v_readfirstlane_b32 s18, v3 +; GFX9-NEXT: v_readfirstlane_b32 s19, v4 +; GFX9-NEXT: v_readfirstlane_b32 s16, v5 +; GFX9-NEXT: v_readfirstlane_b32 s17, v6 +; GFX9-NEXT: v_readfirstlane_b32 s14, v7 +; GFX9-NEXT: v_readfirstlane_b32 s15, v8 +; GFX9-NEXT: v_readfirstlane_b32 s12, v9 +; GFX9-NEXT: v_readfirstlane_b32 s13, v10 +; GFX9-NEXT: v_readfirstlane_b32 s10, v11 +; GFX9-NEXT: v_readfirstlane_b32 s11, v12 +; GFX9-NEXT: v_readfirstlane_b32 s8, v13 +; GFX9-NEXT: v_readfirstlane_b32 s9, v14 +; GFX9-NEXT: v_readfirstlane_b32 s6, v15 +; GFX9-NEXT: v_readfirstlane_b32 s7, v16 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_and_b64 s[26:27], vcc, exec ; GFX9-NEXT: v_readfirstlane_b32 s5, v2 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill @@ -196175,187 +194745,187 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane ; GFX9-NEXT: s_cbranch_scc0 .LBB95_3 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_lshr_b32 s46, s5, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 49 -; GFX9-NEXT: s_lshr_b32 s46, s5, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 48 -; GFX9-NEXT: s_lshr_b32 s46, s5, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 47 -; GFX9-NEXT: s_lshr_b32 s46, s4, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 46 -; GFX9-NEXT: s_lshr_b32 s46, s4, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 45 -; GFX9-NEXT: s_lshr_b32 s46, s29, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 44 -; GFX9-NEXT: s_lshr_b32 s46, s29, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 43 -; GFX9-NEXT: s_lshr_b32 s46, s29, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 42 -; GFX9-NEXT: s_lshr_b32 s46, s28, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 41 -; GFX9-NEXT: s_lshr_b32 s46, s28, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 40 -; GFX9-NEXT: s_lshr_b32 s46, s27, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 39 -; GFX9-NEXT: s_lshr_b32 s46, s27, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 38 -; GFX9-NEXT: s_lshr_b32 s46, s27, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 37 -; GFX9-NEXT: s_lshr_b32 s46, s26, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 36 -; GFX9-NEXT: s_lshr_b32 s46, s26, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 35 -; GFX9-NEXT: s_lshr_b32 s46, s25, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 34 -; GFX9-NEXT: s_lshr_b32 s46, s25, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 33 -; GFX9-NEXT: s_lshr_b32 s46, s25, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 32 -; GFX9-NEXT: s_lshr_b32 s46, s24, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 31 -; GFX9-NEXT: s_lshr_b32 s46, s24, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 30 -; GFX9-NEXT: s_lshr_b32 s46, s23, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 29 -; GFX9-NEXT: s_lshr_b32 s46, s23, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 28 -; GFX9-NEXT: s_lshr_b32 s46, s23, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 27 -; GFX9-NEXT: s_lshr_b32 s46, s22, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 26 -; GFX9-NEXT: s_lshr_b32 s46, s22, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 25 -; GFX9-NEXT: s_lshr_b32 s46, s21, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 24 -; GFX9-NEXT: s_lshr_b32 s46, s21, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 23 -; GFX9-NEXT: s_lshr_b32 s46, s21, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 22 -; GFX9-NEXT: s_lshr_b32 s46, s20, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 21 -; GFX9-NEXT: s_lshr_b32 s46, s20, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 20 -; GFX9-NEXT: s_lshr_b32 s46, s19, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 19 -; GFX9-NEXT: s_lshr_b32 s46, s19, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 18 -; GFX9-NEXT: s_lshr_b32 s46, s19, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 17 -; GFX9-NEXT: s_lshr_b32 s46, s18, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 16 -; GFX9-NEXT: s_lshr_b32 s46, s18, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 15 -; GFX9-NEXT: s_lshr_b32 s46, s17, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 14 -; GFX9-NEXT: s_lshr_b32 s46, s17, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 13 -; GFX9-NEXT: s_lshr_b32 s46, s17, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 12 -; GFX9-NEXT: s_lshr_b32 s46, s16, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 11 -; GFX9-NEXT: s_lshr_b32 s46, s16, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 10 -; GFX9-NEXT: s_lshr_b32 s46, s7, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 9 -; GFX9-NEXT: s_lshr_b32 s46, s7, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 8 -; GFX9-NEXT: s_lshr_b32 s46, s7, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 7 -; GFX9-NEXT: s_lshr_b32 s46, s6, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 6 -; GFX9-NEXT: s_lshr_b32 s46, s6, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 5 -; GFX9-NEXT: s_lshr_b32 s46, s9, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 4 -; GFX9-NEXT: s_lshr_b32 s46, s9, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 3 -; GFX9-NEXT: s_lshr_b32 s46, s9, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 2 -; GFX9-NEXT: s_lshr_b32 s46, s8, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 1 -; GFX9-NEXT: s_lshr_b32 s46, s8, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 0 -; GFX9-NEXT: s_lshr_b32 s82, s11, 24 -; GFX9-NEXT: s_lshr_b32 s83, s11, 16 -; GFX9-NEXT: s_lshr_b32 s85, s11, 8 -; GFX9-NEXT: s_lshr_b32 s84, s10, 16 -; GFX9-NEXT: s_lshr_b32 s86, s10, 8 -; GFX9-NEXT: s_lshr_b32 s87, s13, 24 -; GFX9-NEXT: s_lshr_b32 s96, s13, 16 -; GFX9-NEXT: s_lshr_b32 s98, s13, 8 -; GFX9-NEXT: s_lshr_b32 s97, s12, 16 -; GFX9-NEXT: s_lshr_b32 s99, s12, 8 -; GFX9-NEXT: s_lshr_b32 s38, s15, 24 -; GFX9-NEXT: s_lshr_b32 s39, s15, 16 -; GFX9-NEXT: s_lshr_b32 s49, s15, 8 -; GFX9-NEXT: s_lshr_b32 s48, s14, 16 -; GFX9-NEXT: s_lshr_b32 s50, s14, 8 -; GFX9-NEXT: s_lshr_b32 s51, s41, 24 -; GFX9-NEXT: s_lshr_b32 s52, s41, 16 -; GFX9-NEXT: s_lshr_b32 s54, s41, 8 -; GFX9-NEXT: s_lshr_b32 s53, s40, 16 -; GFX9-NEXT: s_lshr_b32 s55, s40, 8 -; GFX9-NEXT: s_lshr_b32 s64, s43, 24 -; GFX9-NEXT: s_lshr_b32 s65, s43, 16 -; GFX9-NEXT: s_lshr_b32 s67, s43, 8 -; GFX9-NEXT: s_lshr_b32 s66, s42, 16 -; GFX9-NEXT: s_lshr_b32 s68, s42, 8 -; GFX9-NEXT: s_lshr_b32 s69, s45, 24 -; GFX9-NEXT: s_lshr_b32 s70, s45, 16 -; GFX9-NEXT: s_lshr_b32 s80, s45, 8 -; GFX9-NEXT: s_lshr_b32 s71, s44, 16 -; GFX9-NEXT: s_lshr_b32 s81, s44, 8 -; GFX9-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 -; GFX9-NEXT: s_lshr_b64 s[56:57], s[28:29], 24 -; GFX9-NEXT: s_lshr_b64 s[58:59], s[26:27], 24 -; GFX9-NEXT: s_lshr_b64 s[60:61], s[24:25], 24 -; GFX9-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 -; GFX9-NEXT: s_lshr_b64 s[72:73], s[20:21], 24 -; GFX9-NEXT: s_lshr_b64 s[74:75], s[18:19], 24 -; GFX9-NEXT: s_lshr_b64 s[76:77], s[16:17], 24 -; GFX9-NEXT: s_lshr_b64 s[78:79], s[6:7], 24 -; GFX9-NEXT: s_lshr_b64 s[88:89], s[8:9], 24 -; GFX9-NEXT: s_lshr_b64 s[90:91], s[10:11], 24 -; GFX9-NEXT: s_lshr_b64 s[92:93], s[12:13], 24 -; GFX9-NEXT: s_lshr_b64 s[94:95], s[14:15], 24 -; GFX9-NEXT: s_lshr_b64 s[30:31], s[40:41], 24 -; GFX9-NEXT: s_lshr_b64 s[34:35], s[42:43], 24 -; GFX9-NEXT: s_lshr_b64 s[36:37], s[44:45], 24 +; GFX9-NEXT: s_lshr_b32 s26, s5, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 49 +; GFX9-NEXT: s_lshr_b32 s26, s5, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 48 +; GFX9-NEXT: s_lshr_b32 s26, s5, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 47 +; GFX9-NEXT: s_lshr_b32 s26, s4, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 46 +; GFX9-NEXT: s_lshr_b32 s26, s4, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 45 +; GFX9-NEXT: s_lshr_b32 s26, s7, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 44 +; GFX9-NEXT: s_lshr_b32 s26, s7, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 43 +; GFX9-NEXT: s_lshr_b32 s26, s7, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 42 +; GFX9-NEXT: s_lshr_b32 s26, s6, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 41 +; GFX9-NEXT: s_lshr_b32 s26, s6, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 40 +; GFX9-NEXT: s_lshr_b32 s26, s9, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 39 +; GFX9-NEXT: s_lshr_b32 s26, s9, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 38 +; GFX9-NEXT: s_lshr_b32 s26, s9, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 37 +; GFX9-NEXT: s_lshr_b32 s26, s8, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 36 +; GFX9-NEXT: s_lshr_b32 s26, s8, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 35 +; GFX9-NEXT: s_lshr_b32 s26, s11, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 34 +; GFX9-NEXT: s_lshr_b32 s26, s11, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 33 +; GFX9-NEXT: s_lshr_b32 s26, s11, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 32 +; GFX9-NEXT: s_lshr_b32 s26, s10, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 31 +; GFX9-NEXT: s_lshr_b32 s26, s10, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 30 +; GFX9-NEXT: s_lshr_b32 s26, s13, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 29 +; GFX9-NEXT: s_lshr_b32 s26, s13, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 28 +; GFX9-NEXT: s_lshr_b32 s26, s13, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 27 +; GFX9-NEXT: s_lshr_b32 s26, s12, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 26 +; GFX9-NEXT: s_lshr_b32 s26, s12, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 25 +; GFX9-NEXT: s_lshr_b32 s26, s15, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 24 +; GFX9-NEXT: s_lshr_b32 s26, s15, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 23 +; GFX9-NEXT: s_lshr_b32 s26, s15, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 22 +; GFX9-NEXT: s_lshr_b32 s26, s14, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 21 +; GFX9-NEXT: s_lshr_b32 s26, s14, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 20 +; GFX9-NEXT: s_lshr_b32 s26, s17, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 19 +; GFX9-NEXT: s_lshr_b32 s26, s17, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 18 +; GFX9-NEXT: s_lshr_b32 s26, s17, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 17 +; GFX9-NEXT: s_lshr_b32 s26, s16, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 16 +; GFX9-NEXT: s_lshr_b32 s26, s16, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 15 +; GFX9-NEXT: s_lshr_b32 s26, s19, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 14 +; GFX9-NEXT: s_lshr_b32 s26, s19, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 13 +; GFX9-NEXT: s_lshr_b32 s26, s19, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 12 +; GFX9-NEXT: s_lshr_b32 s26, s18, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 11 +; GFX9-NEXT: s_lshr_b32 s26, s18, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 10 +; GFX9-NEXT: s_lshr_b32 s26, s21, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 9 +; GFX9-NEXT: s_lshr_b32 s26, s21, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 8 +; GFX9-NEXT: s_lshr_b32 s26, s21, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 7 +; GFX9-NEXT: s_lshr_b32 s26, s20, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 6 +; GFX9-NEXT: s_lshr_b32 s26, s20, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 5 +; GFX9-NEXT: s_lshr_b32 s26, s23, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 4 +; GFX9-NEXT: s_lshr_b32 s26, s23, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 3 +; GFX9-NEXT: s_lshr_b32 s26, s23, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 2 +; GFX9-NEXT: s_lshr_b32 s26, s22, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 1 +; GFX9-NEXT: s_lshr_b32 s26, s22, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 0 +; GFX9-NEXT: s_lshr_b32 s82, s25, 24 +; GFX9-NEXT: s_lshr_b32 s83, s25, 16 +; GFX9-NEXT: s_lshr_b32 s85, s25, 8 +; GFX9-NEXT: s_lshr_b32 s84, s24, 16 +; GFX9-NEXT: s_lshr_b32 s86, s24, 8 +; GFX9-NEXT: s_lshr_b32 s87, s41, 24 +; GFX9-NEXT: s_lshr_b32 s96, s41, 16 +; GFX9-NEXT: s_lshr_b32 s98, s41, 8 +; GFX9-NEXT: s_lshr_b32 s97, s40, 16 +; GFX9-NEXT: s_lshr_b32 s99, s40, 8 +; GFX9-NEXT: s_lshr_b32 s38, s43, 24 +; GFX9-NEXT: s_lshr_b32 s39, s43, 16 +; GFX9-NEXT: s_lshr_b32 s49, s43, 8 +; GFX9-NEXT: s_lshr_b32 s48, s42, 16 +; GFX9-NEXT: s_lshr_b32 s50, s42, 8 +; GFX9-NEXT: s_lshr_b32 s51, s45, 24 +; GFX9-NEXT: s_lshr_b32 s52, s45, 16 +; GFX9-NEXT: s_lshr_b32 s54, s45, 8 +; GFX9-NEXT: s_lshr_b32 s53, s44, 16 +; GFX9-NEXT: s_lshr_b32 s55, s44, 8 +; GFX9-NEXT: s_lshr_b32 s64, s47, 24 +; GFX9-NEXT: s_lshr_b32 s65, s47, 16 +; GFX9-NEXT: s_lshr_b32 s67, s47, 8 +; GFX9-NEXT: s_lshr_b32 s66, s46, 16 +; GFX9-NEXT: s_lshr_b32 s68, s46, 8 +; GFX9-NEXT: s_lshr_b32 s69, s57, 24 +; GFX9-NEXT: s_lshr_b32 s70, s57, 16 +; GFX9-NEXT: s_lshr_b32 s80, s57, 8 +; GFX9-NEXT: s_lshr_b32 s71, s56, 16 +; GFX9-NEXT: s_lshr_b32 s81, s56, 8 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[28:29], s[6:7], 24 +; GFX9-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 +; GFX9-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 +; GFX9-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; GFX9-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 +; GFX9-NEXT: s_lshr_b64 s[74:75], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[76:77], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[88:89], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[90:91], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[92:93], s[40:41], 24 +; GFX9-NEXT: s_lshr_b64 s[94:95], s[42:43], 24 +; GFX9-NEXT: s_lshr_b64 s[30:31], s[44:45], 24 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[46:47], 24 +; GFX9-NEXT: s_lshr_b64 s[36:37], s[56:57], 24 ; GFX9-NEXT: s_cbranch_execnz .LBB95_4 ; GFX9-NEXT: .LBB95_2: ; %cmp.true ; GFX9-NEXT: v_mov_b32_e32 v15, 0x200 ; GFX9-NEXT: v_pk_add_f16 v26, s5, v15 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v25, s4, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v22, s45, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v21, s44, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v14, s43, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v13, s42, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v12, s41, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v11, s40, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v10, s15, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v9, s14, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v8, s13, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v7, s12, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v6, s11, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v5, s10, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v4, s9, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v3, s8, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v2, s7, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v1, s6, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v49, s17, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v48, s16, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v38, s19, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v37, s18, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v36, s21, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v35, s20, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v34, s23, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v33, s22, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v32, s25, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v31, s24, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v30, s27, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v29, s26, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v28, s29, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v27, s28, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, s57, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, s56, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, s47, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s46, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s45, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s44, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s43, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s42, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s41, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s40, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s25, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s24, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s23, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s22, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s21, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s20, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v49, s19, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v48, s18, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v38, s17, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v37, s16, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v36, s15, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v35, s14, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v34, s13, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v33, s12, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v32, s11, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v31, s10, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v30, s9, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v29, s8, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v28, s7, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v27, s6, v15 op_sel_hi:[1,0] ; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[25:26] ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 @@ -196554,10 +195124,10 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v21 ; GFX9-NEXT: s_branch .LBB95_5 ; GFX9-NEXT: .LBB95_3: -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 ; GFX9-NEXT: ; implicit-def: $sgpr81 ; GFX9-NEXT: ; implicit-def: $sgpr71 ; GFX9-NEXT: ; implicit-def: $sgpr80 @@ -196594,7 +195164,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: ; implicit-def: $sgpr62 ; GFX9-NEXT: ; implicit-def: $sgpr60 ; GFX9-NEXT: ; implicit-def: $sgpr58 -; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr28 ; GFX9-NEXT: ; implicit-def: $sgpr36 ; GFX9-NEXT: ; implicit-def: $sgpr34 ; GFX9-NEXT: ; implicit-def: $sgpr30 @@ -196603,103 +195173,103 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: ; implicit-def: $sgpr90 ; GFX9-NEXT: ; implicit-def: $sgpr88 ; GFX9-NEXT: ; implicit-def: $sgpr78 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: s_branch .LBB95_2 ; GFX9-NEXT: .LBB95_4: ; GFX9-NEXT: v_mov_b32_e32 v15, s71 @@ -196884,11 +195454,11 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v41, s4 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s46 +; GFX9-NEXT: v_mov_b32_e32 v41, s26 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s56 +; GFX9-NEXT: v_mov_b32_e32 v41, s28 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill @@ -196948,36 +195518,36 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v21, s44 -; GFX9-NEXT: v_mov_b32_e32 v22, s45 -; GFX9-NEXT: v_mov_b32_e32 v13, s42 -; GFX9-NEXT: v_mov_b32_e32 v14, s43 -; GFX9-NEXT: v_mov_b32_e32 v11, s40 -; GFX9-NEXT: v_mov_b32_e32 v12, s41 -; GFX9-NEXT: v_mov_b32_e32 v9, s14 -; GFX9-NEXT: v_mov_b32_e32 v10, s15 -; GFX9-NEXT: v_mov_b32_e32 v7, s12 -; GFX9-NEXT: v_mov_b32_e32 v8, s13 -; GFX9-NEXT: v_mov_b32_e32 v5, s10 -; GFX9-NEXT: v_mov_b32_e32 v6, s11 -; GFX9-NEXT: v_mov_b32_e32 v3, s8 -; GFX9-NEXT: v_mov_b32_e32 v4, s9 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NEXT: v_mov_b32_e32 v48, s16 -; GFX9-NEXT: v_mov_b32_e32 v49, s17 -; GFX9-NEXT: v_mov_b32_e32 v37, s18 -; GFX9-NEXT: v_mov_b32_e32 v38, s19 -; GFX9-NEXT: v_mov_b32_e32 v35, s20 -; GFX9-NEXT: v_mov_b32_e32 v36, s21 -; GFX9-NEXT: v_mov_b32_e32 v33, s22 -; GFX9-NEXT: v_mov_b32_e32 v34, s23 -; GFX9-NEXT: v_mov_b32_e32 v31, s24 -; GFX9-NEXT: v_mov_b32_e32 v32, s25 -; GFX9-NEXT: v_mov_b32_e32 v29, s26 -; GFX9-NEXT: v_mov_b32_e32 v30, s27 -; GFX9-NEXT: v_mov_b32_e32 v27, s28 -; GFX9-NEXT: v_mov_b32_e32 v28, s29 +; GFX9-NEXT: v_mov_b32_e32 v21, s56 +; GFX9-NEXT: v_mov_b32_e32 v22, s57 +; GFX9-NEXT: v_mov_b32_e32 v13, s46 +; GFX9-NEXT: v_mov_b32_e32 v14, s47 +; GFX9-NEXT: v_mov_b32_e32 v11, s44 +; GFX9-NEXT: v_mov_b32_e32 v12, s45 +; GFX9-NEXT: v_mov_b32_e32 v9, s42 +; GFX9-NEXT: v_mov_b32_e32 v10, s43 +; GFX9-NEXT: v_mov_b32_e32 v7, s40 +; GFX9-NEXT: v_mov_b32_e32 v8, s41 +; GFX9-NEXT: v_mov_b32_e32 v5, s24 +; GFX9-NEXT: v_mov_b32_e32 v6, s25 +; GFX9-NEXT: v_mov_b32_e32 v3, s22 +; GFX9-NEXT: v_mov_b32_e32 v4, s23 +; GFX9-NEXT: v_mov_b32_e32 v1, s20 +; GFX9-NEXT: v_mov_b32_e32 v2, s21 +; GFX9-NEXT: v_mov_b32_e32 v48, s18 +; GFX9-NEXT: v_mov_b32_e32 v49, s19 +; GFX9-NEXT: v_mov_b32_e32 v37, s16 +; GFX9-NEXT: v_mov_b32_e32 v38, s17 +; GFX9-NEXT: v_mov_b32_e32 v35, s14 +; GFX9-NEXT: v_mov_b32_e32 v36, s15 +; GFX9-NEXT: v_mov_b32_e32 v33, s12 +; GFX9-NEXT: v_mov_b32_e32 v34, s13 +; GFX9-NEXT: v_mov_b32_e32 v31, s10 +; GFX9-NEXT: v_mov_b32_e32 v32, s11 +; GFX9-NEXT: v_mov_b32_e32 v29, s8 +; GFX9-NEXT: v_mov_b32_e32 v30, s9 +; GFX9-NEXT: v_mov_b32_e32 v27, s6 +; GFX9-NEXT: v_mov_b32_e32 v28, s7 ; GFX9-NEXT: v_mov_b32_e32 v26, s5 ; GFX9-NEXT: v_mov_b32_e32 v41, v50 ; GFX9-NEXT: v_mov_b32_e32 v50, v51 @@ -197415,33 +195985,41 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: s_mov_b32 exec_lo, s4 ; GFX11-NEXT: v_writelane_b32 v75, s30, 0 ; GFX11-NEXT: v_writelane_b32 v76, s96, 0 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 -; GFX11-NEXT: v_readfirstlane_b32 s40, v1 -; GFX11-NEXT: v_readfirstlane_b32 s41, v2 ; GFX11-NEXT: v_writelane_b32 v75, s31, 1 ; GFX11-NEXT: v_writelane_b32 v76, s97, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_readfirstlane_b32 s40, v16 +; GFX11-NEXT: v_readfirstlane_b32 s41, v17 +; GFX11-NEXT: v_readfirstlane_b32 s28, v1 +; GFX11-NEXT: v_writelane_b32 v75, s34, 2 +; GFX11-NEXT: v_writelane_b32 v76, s98, 2 +; GFX11-NEXT: v_readfirstlane_b32 s29, v2 ; GFX11-NEXT: v_readfirstlane_b32 s14, v3 ; GFX11-NEXT: v_readfirstlane_b32 s15, v4 +; GFX11-NEXT: v_writelane_b32 v75, s35, 3 +; GFX11-NEXT: v_writelane_b32 v76, s99, 3 ; GFX11-NEXT: v_readfirstlane_b32 s12, v5 -; GFX11-NEXT: v_writelane_b32 v75, s34, 2 -; GFX11-NEXT: v_writelane_b32 v76, s98, 2 ; GFX11-NEXT: v_readfirstlane_b32 s13, v6 ; GFX11-NEXT: v_readfirstlane_b32 s10, v7 +; GFX11-NEXT: v_writelane_b32 v75, s36, 4 +; GFX11-NEXT: v_writelane_b32 v76, s100, 4 ; GFX11-NEXT: v_readfirstlane_b32 s11, v8 -; GFX11-NEXT: v_writelane_b32 v75, s35, 3 -; GFX11-NEXT: v_writelane_b32 v76, s99, 3 ; GFX11-NEXT: v_readfirstlane_b32 s8, v9 ; GFX11-NEXT: v_readfirstlane_b32 s9, v10 +; GFX11-NEXT: v_writelane_b32 v75, s37, 5 +; GFX11-NEXT: v_writelane_b32 v76, s101, 5 ; GFX11-NEXT: v_readfirstlane_b32 s6, v11 -; GFX11-NEXT: v_writelane_b32 v75, s36, 4 -; GFX11-NEXT: v_writelane_b32 v76, s100, 4 ; GFX11-NEXT: v_readfirstlane_b32 s7, v12 ; GFX11-NEXT: v_readfirstlane_b32 s4, v13 +; GFX11-NEXT: v_writelane_b32 v75, s38, 6 +; GFX11-NEXT: v_writelane_b32 v76, s102, 6 ; GFX11-NEXT: v_readfirstlane_b32 s5, v14 -; GFX11-NEXT: v_writelane_b32 v75, s37, 5 -; GFX11-NEXT: v_writelane_b32 v76, s101, 5 ; GFX11-NEXT: s_mov_b32 s99, 0 ; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo +; GFX11-NEXT: v_writelane_b32 v75, s39, 7 +; GFX11-NEXT: v_writelane_b32 v76, s103, 7 ; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:72 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:68 @@ -197462,12 +196040,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:8 ; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:4 ; GFX11-NEXT: scratch_store_b32 off, v74, s32 -; GFX11-NEXT: v_writelane_b32 v75, s38, 6 -; GFX11-NEXT: v_writelane_b32 v76, s102, 6 ; GFX11-NEXT: ; implicit-def: $vgpr78 : SGPR spill to VGPR lane ; GFX11-NEXT: ; implicit-def: $vgpr77 : SGPR spill to VGPR lane -; GFX11-NEXT: v_writelane_b32 v75, s39, 7 -; GFX11-NEXT: v_writelane_b32 v76, s103, 7 ; GFX11-NEXT: v_writelane_b32 v75, s48, 8 ; GFX11-NEXT: v_writelane_b32 v76, s104, 8 ; GFX11-NEXT: v_writelane_b32 v75, s49, 9 @@ -197552,19 +196126,19 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: s_lshr_b32 s87, s14, 8 ; GFX11-NEXT: v_writelane_b32 v78, s42, 29 ; GFX11-NEXT: s_lshr_b32 s42, s22, 16 -; GFX11-NEXT: s_lshr_b32 s96, s41, 24 -; GFX11-NEXT: s_lshr_b32 s97, s41, 16 -; GFX11-NEXT: s_lshr_b32 s100, s41, 8 +; GFX11-NEXT: s_lshr_b32 s96, s29, 24 +; GFX11-NEXT: s_lshr_b32 s97, s29, 16 +; GFX11-NEXT: s_lshr_b32 s100, s29, 8 ; GFX11-NEXT: v_writelane_b32 v78, s42, 28 ; GFX11-NEXT: s_lshr_b32 s42, s22, 8 -; GFX11-NEXT: s_lshr_b32 s98, s40, 16 -; GFX11-NEXT: s_lshr_b32 s101, s40, 8 -; GFX11-NEXT: s_lshr_b32 s102, s29, 24 +; GFX11-NEXT: s_lshr_b32 s98, s28, 16 +; GFX11-NEXT: s_lshr_b32 s101, s28, 8 +; GFX11-NEXT: s_lshr_b32 s102, s41, 24 ; GFX11-NEXT: v_writelane_b32 v78, s42, 27 ; GFX11-NEXT: s_lshr_b32 s42, s21, 24 -; GFX11-NEXT: s_lshr_b32 s103, s29, 16 -; GFX11-NEXT: s_lshr_b32 vcc_hi, s29, 8 -; GFX11-NEXT: s_lshr_b32 s104, s28, 16 +; GFX11-NEXT: s_lshr_b32 s103, s41, 16 +; GFX11-NEXT: s_lshr_b32 vcc_hi, s41, 8 +; GFX11-NEXT: s_lshr_b32 s104, s40, 16 ; GFX11-NEXT: v_writelane_b32 v78, s42, 26 ; GFX11-NEXT: s_lshr_b32 s42, s21, 16 ; GFX11-NEXT: s_lshr_b64 s[62:63], s[26:27], 24 @@ -197587,8 +196161,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: s_lshr_b64 s[92:93], s[14:15], 24 ; GFX11-NEXT: v_writelane_b32 v78, s42, 22 ; GFX11-NEXT: s_lshr_b32 s42, s19, 24 -; GFX11-NEXT: s_lshr_b64 s[94:95], s[40:41], 24 -; GFX11-NEXT: s_lshr_b64 s[30:31], s[28:29], 24 +; GFX11-NEXT: s_lshr_b64 s[94:95], s[28:29], 24 +; GFX11-NEXT: s_lshr_b64 s[30:31], s[40:41], 24 ; GFX11-NEXT: v_writelane_b32 v78, s42, 21 ; GFX11-NEXT: s_lshr_b32 s42, s19, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) @@ -197638,7 +196212,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: s_lshr_b32 s42, s0, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_writelane_b32 v78, s42, 2 -; GFX11-NEXT: s_lshr_b32 s42, s28, 8 +; GFX11-NEXT: s_lshr_b32 s42, s40, 8 ; GFX11-NEXT: v_writelane_b32 v78, s74, 0 ; GFX11-NEXT: v_writelane_b32 v78, s75, 1 ; GFX11-NEXT: s_lshr_b64 s[74:75], s[4:5], 24 @@ -197655,10 +196229,10 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: v_pk_add_f16 v29, 0x200, s23 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v28, 0x200, s22 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v32, 0x200, s20 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v16, 0x200, s29 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v15, 0x200, s28 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s41 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v13, 0x200, s40 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v16, 0x200, s41 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v15, 0x200, s40 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s29 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, s28 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v12, 0x200, s15 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s14 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s13 op_sel_hi:[0,1] @@ -197918,8 +196492,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: v_dual_mov_b32 v52, s0 :: v_dual_mov_b32 v53, s1 ; GFX11-NEXT: v_readlane_b32 s0, v78, 2 ; GFX11-NEXT: v_mov_b32_e32 v71, s50 -; GFX11-NEXT: v_dual_mov_b32 v15, s28 :: v_dual_mov_b32 v16, s29 -; GFX11-NEXT: v_dual_mov_b32 v13, s40 :: v_dual_mov_b32 v14, s41 +; GFX11-NEXT: v_dual_mov_b32 v15, s40 :: v_dual_mov_b32 v16, s41 +; GFX11-NEXT: v_dual_mov_b32 v13, s28 :: v_dual_mov_b32 v14, s29 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mov_b32_e32 v74, s0 ; GFX11-NEXT: v_readlane_b32 s0, v78, 3 @@ -217260,60 +215834,88 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; VI-NEXT: v_writelane_b32 v20, s83, 27 ; VI-NEXT: v_writelane_b32 v20, s84, 28 ; VI-NEXT: v_writelane_b32 v20, s85, 29 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; VI-NEXT: v_writelane_b32 v20, s86, 30 -; VI-NEXT: v_readfirstlane_b32 s42, v3 -; VI-NEXT: v_readfirstlane_b32 s43, v4 -; VI-NEXT: v_readfirstlane_b32 s40, v5 -; VI-NEXT: v_readfirstlane_b32 s41, v6 +; VI-NEXT: v_readfirstlane_b32 s40, v3 +; VI-NEXT: v_mov_b32_e32 v3, s16 +; VI-NEXT: v_readfirstlane_b32 s41, v4 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_readfirstlane_b32 s16, v5 +; VI-NEXT: v_mov_b32_e32 v5, s18 +; VI-NEXT: v_readfirstlane_b32 s17, v6 +; VI-NEXT: v_mov_b32_e32 v6, s19 ; VI-NEXT: v_readfirstlane_b32 s14, v7 +; VI-NEXT: v_mov_b32_e32 v7, s20 ; VI-NEXT: v_readfirstlane_b32 s15, v8 +; VI-NEXT: v_mov_b32_e32 v8, s21 ; VI-NEXT: v_readfirstlane_b32 s12, v9 +; VI-NEXT: v_mov_b32_e32 v9, s22 ; VI-NEXT: v_readfirstlane_b32 s13, v10 +; VI-NEXT: v_mov_b32_e32 v10, s23 ; VI-NEXT: v_readfirstlane_b32 s10, v11 +; VI-NEXT: v_mov_b32_e32 v11, s24 ; VI-NEXT: v_readfirstlane_b32 s11, v12 +; VI-NEXT: v_mov_b32_e32 v12, s25 ; VI-NEXT: v_readfirstlane_b32 s8, v13 +; VI-NEXT: v_mov_b32_e32 v13, s26 ; VI-NEXT: v_readfirstlane_b32 s9, v14 +; VI-NEXT: v_mov_b32_e32 v14, s27 ; VI-NEXT: v_readfirstlane_b32 s6, v15 +; VI-NEXT: v_mov_b32_e32 v15, s28 ; VI-NEXT: v_readfirstlane_b32 s7, v16 +; VI-NEXT: v_mov_b32_e32 v16, s29 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; VI-NEXT: v_writelane_b32 v20, s86, 30 ; VI-NEXT: v_readfirstlane_b32 s4, v17 ; VI-NEXT: v_readfirstlane_b32 s5, v18 -; VI-NEXT: v_readfirstlane_b32 s44, v1 +; VI-NEXT: v_readfirstlane_b32 s44, v3 +; VI-NEXT: v_readfirstlane_b32 s45, v4 +; VI-NEXT: v_readfirstlane_b32 s42, v5 +; VI-NEXT: v_readfirstlane_b32 s43, v6 +; VI-NEXT: v_readfirstlane_b32 s28, v7 +; VI-NEXT: v_readfirstlane_b32 s29, v8 +; VI-NEXT: v_readfirstlane_b32 s26, v9 +; VI-NEXT: v_readfirstlane_b32 s27, v10 +; VI-NEXT: v_readfirstlane_b32 s24, v11 +; VI-NEXT: v_readfirstlane_b32 s25, v12 +; VI-NEXT: v_readfirstlane_b32 s22, v13 +; VI-NEXT: v_readfirstlane_b32 s23, v14 +; VI-NEXT: v_readfirstlane_b32 s20, v15 +; VI-NEXT: v_readfirstlane_b32 s21, v16 +; VI-NEXT: v_readfirstlane_b32 s18, v1 ; VI-NEXT: s_and_b64 s[46:47], vcc, exec -; VI-NEXT: v_readfirstlane_b32 s45, v2 +; VI-NEXT: v_readfirstlane_b32 s19, v2 ; VI-NEXT: v_writelane_b32 v20, s87, 31 ; VI-NEXT: ; implicit-def: $vgpr21 : SGPR spill to VGPR lane ; VI-NEXT: s_cbranch_scc0 .LBB99_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s46, s45, 24 +; VI-NEXT: s_lshr_b32 s46, s19, 24 ; VI-NEXT: v_writelane_b32 v21, s46, 0 -; VI-NEXT: s_lshr_b32 s46, s45, 16 +; VI-NEXT: s_lshr_b32 s46, s19, 16 ; VI-NEXT: v_writelane_b32 v21, s46, 1 -; VI-NEXT: s_lshr_b32 s46, s45, 8 +; VI-NEXT: s_lshr_b32 s46, s19, 8 ; VI-NEXT: v_writelane_b32 v21, s46, 2 -; VI-NEXT: s_lshr_b32 s46, s44, 16 +; VI-NEXT: s_lshr_b32 s46, s18, 16 ; VI-NEXT: v_writelane_b32 v21, s46, 3 -; VI-NEXT: s_lshr_b32 s46, s44, 8 +; VI-NEXT: s_lshr_b32 s46, s18, 8 ; VI-NEXT: v_writelane_b32 v21, s46, 4 -; VI-NEXT: s_lshr_b32 s46, s29, 24 +; VI-NEXT: s_lshr_b32 s46, s21, 24 ; VI-NEXT: v_writelane_b32 v21, s46, 5 -; VI-NEXT: s_lshr_b32 s46, s29, 16 +; VI-NEXT: s_lshr_b32 s46, s21, 16 ; VI-NEXT: v_writelane_b32 v21, s46, 6 -; VI-NEXT: s_lshr_b32 s46, s29, 8 +; VI-NEXT: s_lshr_b32 s46, s21, 8 ; VI-NEXT: v_writelane_b32 v21, s46, 7 -; VI-NEXT: s_lshr_b32 s46, s28, 16 +; VI-NEXT: s_lshr_b32 s46, s20, 16 ; VI-NEXT: v_writelane_b32 v21, s46, 8 -; VI-NEXT: s_lshr_b32 s46, s28, 8 +; VI-NEXT: s_lshr_b32 s46, s20, 8 ; VI-NEXT: v_writelane_b32 v21, s46, 9 -; VI-NEXT: s_lshr_b32 s46, s27, 24 +; VI-NEXT: s_lshr_b32 s46, s23, 24 ; VI-NEXT: v_writelane_b32 v21, s46, 10 -; VI-NEXT: s_lshr_b32 s46, s27, 16 +; VI-NEXT: s_lshr_b32 s46, s23, 16 ; VI-NEXT: v_writelane_b32 v21, s46, 11 -; VI-NEXT: s_lshr_b32 s46, s27, 8 +; VI-NEXT: s_lshr_b32 s46, s23, 8 ; VI-NEXT: v_writelane_b32 v21, s46, 12 -; VI-NEXT: s_lshr_b32 s46, s26, 16 +; VI-NEXT: s_lshr_b32 s46, s22, 16 ; VI-NEXT: v_writelane_b32 v21, s46, 13 -; VI-NEXT: s_lshr_b32 s46, s26, 8 +; VI-NEXT: s_lshr_b32 s46, s22, 8 ; VI-NEXT: v_writelane_b32 v21, s46, 14 ; VI-NEXT: s_lshr_b32 s46, s25, 24 ; VI-NEXT: v_writelane_b32 v21, s46, 15 @@ -217325,17 +215927,17 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; VI-NEXT: v_writelane_b32 v21, s46, 18 ; VI-NEXT: s_lshr_b32 s46, s24, 8 ; VI-NEXT: v_writelane_b32 v21, s46, 19 -; VI-NEXT: s_lshr_b32 s46, s23, 24 +; VI-NEXT: s_lshr_b32 s46, s27, 24 ; VI-NEXT: v_writelane_b32 v21, s46, 20 -; VI-NEXT: s_lshr_b32 s46, s23, 16 +; VI-NEXT: s_lshr_b32 s46, s27, 16 ; VI-NEXT: v_writelane_b32 v21, s46, 21 -; VI-NEXT: s_lshr_b32 s46, s23, 8 +; VI-NEXT: s_lshr_b32 s46, s27, 8 ; VI-NEXT: v_writelane_b32 v21, s46, 22 -; VI-NEXT: s_lshr_b32 s46, s22, 16 +; VI-NEXT: s_lshr_b32 s46, s26, 16 ; VI-NEXT: v_writelane_b32 v21, s46, 23 -; VI-NEXT: s_lshr_b32 s46, s22, 8 +; VI-NEXT: s_lshr_b32 s46, s26, 8 ; VI-NEXT: v_writelane_b32 v21, s46, 24 -; VI-NEXT: s_lshr_b32 s46, s21, 24 +; VI-NEXT: s_lshr_b32 s46, s29, 24 ; VI-NEXT: v_writelane_b32 v21, s46, 25 ; VI-NEXT: s_lshr_b32 s46, s5, 24 ; VI-NEXT: v_writelane_b32 v21, s46, 26 @@ -217397,58 +215999,50 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; VI-NEXT: v_writelane_b32 v21, s46, 54 ; VI-NEXT: s_lshr_b32 s46, s14, 8 ; VI-NEXT: v_writelane_b32 v21, s46, 55 -; VI-NEXT: s_lshr_b32 s46, s41, 24 +; VI-NEXT: s_lshr_b32 s46, s17, 24 ; VI-NEXT: v_writelane_b32 v21, s46, 56 -; VI-NEXT: s_lshr_b32 s46, s41, 16 -; VI-NEXT: s_lshr_b32 s80, s21, 16 -; VI-NEXT: s_lshr_b32 s82, s21, 8 -; VI-NEXT: s_lshr_b32 s84, s20, 16 -; VI-NEXT: s_lshr_b32 s86, s20, 8 -; VI-NEXT: s_lshr_b32 s51, s19, 24 -; VI-NEXT: s_lshr_b32 s53, s19, 16 -; VI-NEXT: s_lshr_b32 s54, s19, 8 -; VI-NEXT: s_lshr_b32 s65, s18, 16 -; VI-NEXT: s_lshr_b32 s66, s18, 8 -; VI-NEXT: s_lshr_b32 s67, s17, 24 -; VI-NEXT: s_lshr_b32 s68, s17, 16 -; VI-NEXT: s_lshr_b32 s69, s17, 8 -; VI-NEXT: s_lshr_b32 s70, s16, 16 -; VI-NEXT: s_lshr_b32 s71, s16, 8 +; VI-NEXT: s_lshr_b32 s46, s17, 16 +; VI-NEXT: s_lshr_b32 s80, s29, 16 +; VI-NEXT: s_lshr_b32 s82, s29, 8 +; VI-NEXT: s_lshr_b32 s84, s28, 16 +; VI-NEXT: s_lshr_b32 s86, s28, 8 +; VI-NEXT: s_lshr_b32 s51, s43, 24 +; VI-NEXT: s_lshr_b32 s53, s43, 16 +; VI-NEXT: s_lshr_b32 s54, s43, 8 +; VI-NEXT: s_lshr_b32 s65, s42, 16 +; VI-NEXT: s_lshr_b32 s66, s42, 8 +; VI-NEXT: s_lshr_b32 s67, s45, 24 +; VI-NEXT: s_lshr_b32 s68, s45, 16 +; VI-NEXT: s_lshr_b32 s69, s45, 8 +; VI-NEXT: s_lshr_b32 s70, s44, 16 +; VI-NEXT: s_lshr_b32 s71, s44, 8 ; VI-NEXT: v_writelane_b32 v21, s46, 57 -; VI-NEXT: s_lshr_b32 s81, s41, 8 -; VI-NEXT: s_lshr_b32 s83, s40, 16 -; VI-NEXT: s_lshr_b32 s85, s40, 8 -; VI-NEXT: s_lshr_b32 s87, s43, 24 -; VI-NEXT: s_lshr_b32 s50, s43, 16 -; VI-NEXT: s_lshr_b32 s52, s43, 8 -; VI-NEXT: s_lshr_b32 s55, s42, 16 -; VI-NEXT: s_lshr_b32 s64, s42, 8 -; VI-NEXT: s_lshr_b64 s[76:77], s[44:45], 24 -; VI-NEXT: s_lshr_b64 s[88:89], s[28:29], 24 -; VI-NEXT: s_lshr_b64 s[90:91], s[26:27], 24 +; VI-NEXT: s_lshr_b32 s81, s17, 8 +; VI-NEXT: s_lshr_b32 s83, s16, 16 +; VI-NEXT: s_lshr_b32 s85, s16, 8 +; VI-NEXT: s_lshr_b32 s87, s41, 24 +; VI-NEXT: s_lshr_b32 s50, s41, 16 +; VI-NEXT: s_lshr_b32 s52, s41, 8 +; VI-NEXT: s_lshr_b32 s55, s40, 16 +; VI-NEXT: s_lshr_b32 s64, s40, 8 +; VI-NEXT: s_lshr_b64 s[76:77], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[88:89], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[90:91], s[22:23], 24 ; VI-NEXT: s_lshr_b64 s[30:31], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[34:35], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[36:37], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[38:39], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[48:49], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[34:35], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[36:37], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[38:39], s[42:43], 24 +; VI-NEXT: s_lshr_b64 s[48:49], s[44:45], 24 ; VI-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 ; VI-NEXT: s_lshr_b64 s[56:57], s[6:7], 24 ; VI-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 ; VI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 ; VI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 ; VI-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 -; VI-NEXT: s_lshr_b64 s[74:75], s[40:41], 24 -; VI-NEXT: s_lshr_b64 s[78:79], s[42:43], 24 +; VI-NEXT: s_lshr_b64 s[74:75], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[78:79], s[40:41], 24 ; VI-NEXT: s_cbranch_execnz .LBB99_3 ; VI-NEXT: .LBB99_2: ; %cmp.true -; VI-NEXT: s_and_b32 s46, s43, 0xffff0000 -; VI-NEXT: s_add_i32 s43, s43, 3 -; VI-NEXT: s_and_b32 s43, s43, 0xffff -; VI-NEXT: s_or_b32 s43, s46, s43 -; VI-NEXT: s_and_b32 s46, s42, 0xffff0000 -; VI-NEXT: s_add_i32 s42, s42, 3 -; VI-NEXT: s_and_b32 s42, s42, 0xffff -; VI-NEXT: s_or_b32 s42, s46, s42 ; VI-NEXT: s_and_b32 s46, s41, 0xffff0000 ; VI-NEXT: s_add_i32 s41, s41, 3 ; VI-NEXT: s_and_b32 s41, s41, 0xffff @@ -217457,6 +216051,14 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; VI-NEXT: s_add_i32 s40, s40, 3 ; VI-NEXT: s_and_b32 s40, s40, 0xffff ; VI-NEXT: s_or_b32 s40, s46, s40 +; VI-NEXT: s_and_b32 s46, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_or_b32 s17, s46, s17 +; VI-NEXT: s_and_b32 s46, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_or_b32 s16, s46, s16 ; VI-NEXT: s_and_b32 s46, s15, 0xffff0000 ; VI-NEXT: s_add_i32 s15, s15, 3 ; VI-NEXT: s_and_b32 s15, s15, 0xffff @@ -217505,106 +216107,106 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; VI-NEXT: s_add_i32 s4, s4, 3 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_or_b32 s4, s46, s4 -; VI-NEXT: s_and_b32 s46, s17, 0xffff0000 -; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_and_b32 s17, s17, 0xffff -; VI-NEXT: s_or_b32 s17, s46, s17 -; VI-NEXT: s_and_b32 s46, s16, 0xffff0000 -; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_or_b32 s16, s46, s16 -; VI-NEXT: s_and_b32 s46, s19, 0xffff0000 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_and_b32 s19, s19, 0xffff -; VI-NEXT: s_or_b32 s19, s46, s19 -; VI-NEXT: s_and_b32 s46, s18, 0xffff0000 -; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_and_b32 s18, s18, 0xffff -; VI-NEXT: s_or_b32 s18, s46, s18 -; VI-NEXT: s_and_b32 s46, s21, 0xffff0000 -; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_and_b32 s21, s21, 0xffff -; VI-NEXT: s_or_b32 s21, s46, s21 -; VI-NEXT: s_and_b32 s46, s20, 0xffff0000 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_and_b32 s20, s20, 0xffff -; VI-NEXT: s_or_b32 s20, s46, s20 -; VI-NEXT: s_and_b32 s46, s23, 0xffff0000 -; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: s_and_b32 s23, s23, 0xffff -; VI-NEXT: s_or_b32 s23, s46, s23 -; VI-NEXT: s_and_b32 s46, s22, 0xffff0000 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_and_b32 s22, s22, 0xffff -; VI-NEXT: s_or_b32 s22, s46, s22 -; VI-NEXT: s_and_b32 s46, s25, 0xffff0000 -; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_and_b32 s25, s25, 0xffff -; VI-NEXT: s_or_b32 s25, s46, s25 -; VI-NEXT: s_and_b32 s46, s24, 0xffff0000 -; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_and_b32 s24, s24, 0xffff -; VI-NEXT: s_or_b32 s24, s46, s24 -; VI-NEXT: s_and_b32 s46, s27, 0xffff0000 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_and_b32 s27, s27, 0xffff -; VI-NEXT: s_or_b32 s27, s46, s27 -; VI-NEXT: s_and_b32 s46, s26, 0xffff0000 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_and_b32 s26, s26, 0xffff -; VI-NEXT: s_or_b32 s26, s46, s26 -; VI-NEXT: s_and_b32 s46, s29, 0xffff0000 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_and_b32 s29, s29, 0xffff -; VI-NEXT: s_or_b32 s29, s46, s29 -; VI-NEXT: s_and_b32 s46, s28, 0xffff0000 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_and_b32 s28, s28, 0xffff -; VI-NEXT: s_or_b32 s28, s46, s28 ; VI-NEXT: s_and_b32 s46, s45, 0xffff0000 ; VI-NEXT: s_add_i32 s45, s45, 3 ; VI-NEXT: s_and_b32 s45, s45, 0xffff ; VI-NEXT: s_or_b32 s45, s46, s45 ; VI-NEXT: s_and_b32 s46, s44, 0xffff0000 ; VI-NEXT: s_add_i32 s44, s44, 3 -; VI-NEXT: s_add_i32 s45, s45, 0x30000 ; VI-NEXT: s_and_b32 s44, s44, 0xffff ; VI-NEXT: s_or_b32 s44, s46, s44 -; VI-NEXT: s_lshr_b32 s46, s45, 24 +; VI-NEXT: s_and_b32 s46, s43, 0xffff0000 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_and_b32 s43, s43, 0xffff +; VI-NEXT: s_or_b32 s43, s46, s43 +; VI-NEXT: s_and_b32 s46, s42, 0xffff0000 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_and_b32 s42, s42, 0xffff +; VI-NEXT: s_or_b32 s42, s46, s42 +; VI-NEXT: s_and_b32 s46, s29, 0xffff0000 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_and_b32 s29, s29, 0xffff +; VI-NEXT: s_or_b32 s29, s46, s29 +; VI-NEXT: s_and_b32 s46, s28, 0xffff0000 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s28, s28, 0xffff +; VI-NEXT: s_or_b32 s28, s46, s28 +; VI-NEXT: s_and_b32 s46, s27, 0xffff0000 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_and_b32 s27, s27, 0xffff +; VI-NEXT: s_or_b32 s27, s46, s27 +; VI-NEXT: s_and_b32 s46, s26, 0xffff0000 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_and_b32 s26, s26, 0xffff +; VI-NEXT: s_or_b32 s26, s46, s26 +; VI-NEXT: s_and_b32 s46, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_and_b32 s25, s25, 0xffff +; VI-NEXT: s_or_b32 s25, s46, s25 +; VI-NEXT: s_and_b32 s46, s24, 0xffff0000 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_and_b32 s24, s24, 0xffff +; VI-NEXT: s_or_b32 s24, s46, s24 +; VI-NEXT: s_and_b32 s46, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: s_or_b32 s23, s46, s23 +; VI-NEXT: s_and_b32 s46, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_and_b32 s22, s22, 0xffff +; VI-NEXT: s_or_b32 s22, s46, s22 +; VI-NEXT: s_and_b32 s46, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_or_b32 s21, s46, s21 +; VI-NEXT: s_and_b32 s46, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s20, s20, 0xffff +; VI-NEXT: s_or_b32 s20, s46, s20 +; VI-NEXT: s_and_b32 s46, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_or_b32 s19, s46, s19 +; VI-NEXT: s_and_b32 s46, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s19, s19, 0x30000 +; VI-NEXT: s_and_b32 s18, s18, 0xffff +; VI-NEXT: s_or_b32 s18, s46, s18 +; VI-NEXT: s_lshr_b32 s46, s19, 24 ; VI-NEXT: v_writelane_b32 v21, s46, 0 -; VI-NEXT: s_lshr_b32 s46, s45, 16 -; VI-NEXT: s_add_i32 s44, s44, 0x30000 +; VI-NEXT: s_lshr_b32 s46, s19, 16 +; VI-NEXT: s_add_i32 s18, s18, 0x30000 ; VI-NEXT: v_writelane_b32 v21, s46, 1 -; VI-NEXT: s_lshr_b32 s46, s45, 8 +; VI-NEXT: s_lshr_b32 s46, s19, 8 ; VI-NEXT: v_writelane_b32 v21, s46, 2 -; VI-NEXT: s_lshr_b32 s46, s44, 16 -; VI-NEXT: s_add_i32 s29, s29, 0x30000 +; VI-NEXT: s_lshr_b32 s46, s18, 16 +; VI-NEXT: s_add_i32 s21, s21, 0x30000 ; VI-NEXT: v_writelane_b32 v21, s46, 3 -; VI-NEXT: s_lshr_b32 s46, s44, 8 +; VI-NEXT: s_lshr_b32 s46, s18, 8 ; VI-NEXT: v_writelane_b32 v21, s46, 4 -; VI-NEXT: s_lshr_b32 s46, s29, 24 +; VI-NEXT: s_lshr_b32 s46, s21, 24 ; VI-NEXT: v_writelane_b32 v21, s46, 5 -; VI-NEXT: s_lshr_b32 s46, s29, 16 -; VI-NEXT: s_add_i32 s28, s28, 0x30000 +; VI-NEXT: s_lshr_b32 s46, s21, 16 +; VI-NEXT: s_add_i32 s20, s20, 0x30000 ; VI-NEXT: v_writelane_b32 v21, s46, 6 -; VI-NEXT: s_lshr_b32 s46, s29, 8 +; VI-NEXT: s_lshr_b32 s46, s21, 8 ; VI-NEXT: v_writelane_b32 v21, s46, 7 -; VI-NEXT: s_lshr_b32 s46, s28, 16 -; VI-NEXT: s_add_i32 s27, s27, 0x30000 +; VI-NEXT: s_lshr_b32 s46, s20, 16 +; VI-NEXT: s_add_i32 s23, s23, 0x30000 ; VI-NEXT: v_writelane_b32 v21, s46, 8 -; VI-NEXT: s_lshr_b32 s46, s28, 8 +; VI-NEXT: s_lshr_b32 s46, s20, 8 ; VI-NEXT: v_writelane_b32 v21, s46, 9 -; VI-NEXT: s_lshr_b32 s46, s27, 24 +; VI-NEXT: s_lshr_b32 s46, s23, 24 ; VI-NEXT: v_writelane_b32 v21, s46, 10 -; VI-NEXT: s_lshr_b32 s46, s27, 16 -; VI-NEXT: s_add_i32 s26, s26, 0x30000 +; VI-NEXT: s_lshr_b32 s46, s23, 16 +; VI-NEXT: s_add_i32 s22, s22, 0x30000 ; VI-NEXT: v_writelane_b32 v21, s46, 11 -; VI-NEXT: s_lshr_b32 s46, s27, 8 +; VI-NEXT: s_lshr_b32 s46, s23, 8 ; VI-NEXT: v_writelane_b32 v21, s46, 12 -; VI-NEXT: s_lshr_b32 s46, s26, 16 +; VI-NEXT: s_lshr_b32 s46, s22, 16 ; VI-NEXT: s_add_i32 s25, s25, 0x30000 ; VI-NEXT: v_writelane_b32 v21, s46, 13 -; VI-NEXT: s_lshr_b32 s46, s26, 8 +; VI-NEXT: s_lshr_b32 s46, s22, 8 ; VI-NEXT: v_writelane_b32 v21, s46, 14 ; VI-NEXT: s_lshr_b32 s46, s25, 24 ; VI-NEXT: v_writelane_b32 v21, s46, 15 @@ -217614,24 +216216,24 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; VI-NEXT: s_lshr_b32 s46, s25, 8 ; VI-NEXT: v_writelane_b32 v21, s46, 17 ; VI-NEXT: s_lshr_b32 s46, s24, 16 -; VI-NEXT: s_add_i32 s23, s23, 0x30000 +; VI-NEXT: s_add_i32 s27, s27, 0x30000 ; VI-NEXT: v_writelane_b32 v21, s46, 18 ; VI-NEXT: s_lshr_b32 s46, s24, 8 ; VI-NEXT: v_writelane_b32 v21, s46, 19 -; VI-NEXT: s_lshr_b32 s46, s23, 24 +; VI-NEXT: s_lshr_b32 s46, s27, 24 ; VI-NEXT: v_writelane_b32 v21, s46, 20 -; VI-NEXT: s_lshr_b32 s46, s23, 16 -; VI-NEXT: s_add_i32 s22, s22, 0x30000 +; VI-NEXT: s_lshr_b32 s46, s27, 16 +; VI-NEXT: s_add_i32 s26, s26, 0x30000 ; VI-NEXT: v_writelane_b32 v21, s46, 21 -; VI-NEXT: s_lshr_b32 s46, s23, 8 +; VI-NEXT: s_lshr_b32 s46, s27, 8 ; VI-NEXT: v_writelane_b32 v21, s46, 22 -; VI-NEXT: s_lshr_b32 s46, s22, 16 -; VI-NEXT: s_add_i32 s21, s21, 0x30000 +; VI-NEXT: s_lshr_b32 s46, s26, 16 +; VI-NEXT: s_add_i32 s29, s29, 0x30000 ; VI-NEXT: v_writelane_b32 v21, s46, 23 -; VI-NEXT: s_lshr_b32 s46, s22, 8 +; VI-NEXT: s_lshr_b32 s46, s26, 8 ; VI-NEXT: s_add_i32 s5, s5, 0x30000 ; VI-NEXT: v_writelane_b32 v21, s46, 24 -; VI-NEXT: s_lshr_b32 s46, s21, 24 +; VI-NEXT: s_lshr_b32 s46, s29, 24 ; VI-NEXT: v_writelane_b32 v21, s46, 25 ; VI-NEXT: s_lshr_b32 s46, s5, 24 ; VI-NEXT: v_writelane_b32 v21, s46, 26 @@ -217701,313 +216303,313 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; VI-NEXT: s_lshr_b32 s46, s15, 8 ; VI-NEXT: v_writelane_b32 v21, s46, 53 ; VI-NEXT: s_lshr_b32 s46, s14, 16 -; VI-NEXT: s_add_i32 s41, s41, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 ; VI-NEXT: v_writelane_b32 v21, s46, 54 ; VI-NEXT: s_lshr_b32 s46, s14, 8 ; VI-NEXT: v_writelane_b32 v21, s46, 55 -; VI-NEXT: s_lshr_b32 s46, s41, 24 -; VI-NEXT: s_add_i32 s43, s43, 0x30000 -; VI-NEXT: s_add_i32 s42, s42, 0x30000 +; VI-NEXT: s_lshr_b32 s46, s17, 24 +; VI-NEXT: s_add_i32 s41, s41, 0x30000 ; VI-NEXT: s_add_i32 s40, s40, 0x30000 -; VI-NEXT: s_add_i32 s17, s17, 0x30000 ; VI-NEXT: s_add_i32 s16, s16, 0x30000 -; VI-NEXT: s_add_i32 s19, s19, 0x30000 -; VI-NEXT: s_add_i32 s18, s18, 0x30000 -; VI-NEXT: s_add_i32 s20, s20, 0x30000 +; VI-NEXT: s_add_i32 s45, s45, 0x30000 +; VI-NEXT: s_add_i32 s44, s44, 0x30000 +; VI-NEXT: s_add_i32 s43, s43, 0x30000 +; VI-NEXT: s_add_i32 s42, s42, 0x30000 +; VI-NEXT: s_add_i32 s28, s28, 0x30000 ; VI-NEXT: v_writelane_b32 v21, s46, 56 -; VI-NEXT: s_lshr_b32 s46, s41, 16 -; VI-NEXT: s_lshr_b32 s80, s21, 16 -; VI-NEXT: s_lshr_b32 s82, s21, 8 -; VI-NEXT: s_lshr_b32 s84, s20, 16 -; VI-NEXT: s_lshr_b32 s86, s20, 8 -; VI-NEXT: s_lshr_b32 s51, s19, 24 -; VI-NEXT: s_lshr_b32 s53, s19, 16 -; VI-NEXT: s_lshr_b32 s54, s19, 8 -; VI-NEXT: s_lshr_b32 s65, s18, 16 -; VI-NEXT: s_lshr_b32 s66, s18, 8 -; VI-NEXT: s_lshr_b32 s67, s17, 24 -; VI-NEXT: s_lshr_b32 s68, s17, 16 -; VI-NEXT: s_lshr_b32 s69, s17, 8 -; VI-NEXT: s_lshr_b32 s70, s16, 16 -; VI-NEXT: s_lshr_b32 s71, s16, 8 +; VI-NEXT: s_lshr_b32 s46, s17, 16 +; VI-NEXT: s_lshr_b32 s80, s29, 16 +; VI-NEXT: s_lshr_b32 s82, s29, 8 +; VI-NEXT: s_lshr_b32 s84, s28, 16 +; VI-NEXT: s_lshr_b32 s86, s28, 8 +; VI-NEXT: s_lshr_b32 s51, s43, 24 +; VI-NEXT: s_lshr_b32 s53, s43, 16 +; VI-NEXT: s_lshr_b32 s54, s43, 8 +; VI-NEXT: s_lshr_b32 s65, s42, 16 +; VI-NEXT: s_lshr_b32 s66, s42, 8 +; VI-NEXT: s_lshr_b32 s67, s45, 24 +; VI-NEXT: s_lshr_b32 s68, s45, 16 +; VI-NEXT: s_lshr_b32 s69, s45, 8 +; VI-NEXT: s_lshr_b32 s70, s44, 16 +; VI-NEXT: s_lshr_b32 s71, s44, 8 ; VI-NEXT: v_writelane_b32 v21, s46, 57 -; VI-NEXT: s_lshr_b32 s81, s41, 8 -; VI-NEXT: s_lshr_b32 s83, s40, 16 -; VI-NEXT: s_lshr_b32 s85, s40, 8 -; VI-NEXT: s_lshr_b32 s87, s43, 24 -; VI-NEXT: s_lshr_b32 s50, s43, 16 -; VI-NEXT: s_lshr_b32 s52, s43, 8 -; VI-NEXT: s_lshr_b32 s55, s42, 16 -; VI-NEXT: s_lshr_b32 s64, s42, 8 -; VI-NEXT: s_lshr_b64 s[76:77], s[44:45], 24 -; VI-NEXT: s_lshr_b64 s[88:89], s[28:29], 24 -; VI-NEXT: s_lshr_b64 s[90:91], s[26:27], 24 +; VI-NEXT: s_lshr_b32 s81, s17, 8 +; VI-NEXT: s_lshr_b32 s83, s16, 16 +; VI-NEXT: s_lshr_b32 s85, s16, 8 +; VI-NEXT: s_lshr_b32 s87, s41, 24 +; VI-NEXT: s_lshr_b32 s50, s41, 16 +; VI-NEXT: s_lshr_b32 s52, s41, 8 +; VI-NEXT: s_lshr_b32 s55, s40, 16 +; VI-NEXT: s_lshr_b32 s64, s40, 8 +; VI-NEXT: s_lshr_b64 s[76:77], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[88:89], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[90:91], s[22:23], 24 ; VI-NEXT: s_lshr_b64 s[30:31], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[34:35], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[36:37], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[38:39], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[48:49], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[34:35], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[36:37], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[38:39], s[42:43], 24 +; VI-NEXT: s_lshr_b64 s[48:49], s[44:45], 24 ; VI-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 ; VI-NEXT: s_lshr_b64 s[56:57], s[6:7], 24 ; VI-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 ; VI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 ; VI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 ; VI-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 -; VI-NEXT: s_lshr_b64 s[74:75], s[40:41], 24 -; VI-NEXT: s_lshr_b64 s[78:79], s[42:43], 24 +; VI-NEXT: s_lshr_b64 s[74:75], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[78:79], s[40:41], 24 ; VI-NEXT: .LBB99_3: ; %end ; VI-NEXT: s_lshl_b32 s47, s71, 8 -; VI-NEXT: s_and_b32 s16, s16, 0xff -; VI-NEXT: s_or_b32 s16, s16, s47 +; VI-NEXT: s_and_b32 s44, s44, 0xff +; VI-NEXT: s_or_b32 s44, s44, s47 ; VI-NEXT: s_lshl_b32 s47, s48, 8 ; VI-NEXT: s_and_b32 s57, s70, 0xff ; VI-NEXT: s_or_b32 s47, s57, s47 -; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_and_b32 s44, s44, 0xffff ; VI-NEXT: s_lshl_b32 s47, s47, 16 -; VI-NEXT: s_or_b32 s16, s16, s47 -; VI-NEXT: v_mov_b32_e32 v1, s16 -; VI-NEXT: s_and_b32 s16, s17, 0xff -; VI-NEXT: s_lshl_b32 s17, s69, 8 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_and_b32 s17, s68, 0xff +; VI-NEXT: s_or_b32 s44, s44, s47 +; VI-NEXT: v_mov_b32_e32 v1, s44 +; VI-NEXT: s_and_b32 s44, s45, 0xff +; VI-NEXT: s_lshl_b32 s45, s69, 8 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, s68, 0xff ; VI-NEXT: s_lshl_b32 s47, s67, 8 -; VI-NEXT: s_or_b32 s17, s17, s47 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: s_lshl_b32 s16, s66, 8 -; VI-NEXT: s_and_b32 s17, s18, 0xff -; VI-NEXT: s_or_b32 s16, s17, s16 -; VI-NEXT: s_lshl_b32 s17, s38, 8 -; VI-NEXT: s_and_b32 s18, s65, 0xff -; VI-NEXT: s_or_b32 s17, s18, s17 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_mov_b32_e32 v3, s16 -; VI-NEXT: s_and_b32 s16, s19, 0xff -; VI-NEXT: s_lshl_b32 s17, s54, 8 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_and_b32 s17, s53, 0xff -; VI-NEXT: s_lshl_b32 s18, s51, 8 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_mov_b32_e32 v4, s16 -; VI-NEXT: s_lshl_b32 s16, s86, 8 -; VI-NEXT: s_and_b32 s17, s20, 0xff -; VI-NEXT: s_or_b32 s16, s17, s16 -; VI-NEXT: s_lshl_b32 s17, s36, 8 -; VI-NEXT: s_and_b32 s18, s84, 0xff -; VI-NEXT: s_or_b32 s17, s18, s17 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_mov_b32_e32 v5, s16 -; VI-NEXT: s_and_b32 s16, s21, 0xff -; VI-NEXT: s_lshl_b32 s17, s82, 8 -; VI-NEXT: v_readlane_b32 s18, v21, 25 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_and_b32 s17, s80, 0xff -; VI-NEXT: s_lshl_b32 s18, s18, 8 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_mov_b32_e32 v6, s16 -; VI-NEXT: v_readlane_b32 s16, v21, 24 -; VI-NEXT: s_lshl_b32 s16, s16, 8 -; VI-NEXT: s_and_b32 s17, s22, 0xff -; VI-NEXT: v_readlane_b32 s18, v21, 23 -; VI-NEXT: s_or_b32 s16, s17, s16 -; VI-NEXT: s_lshl_b32 s17, s34, 8 -; VI-NEXT: s_and_b32 s18, s18, 0xff -; VI-NEXT: s_or_b32 s17, s18, s17 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_readlane_b32 s17, v21, 22 -; VI-NEXT: v_mov_b32_e32 v7, s16 -; VI-NEXT: s_and_b32 s16, s23, 0xff -; VI-NEXT: s_lshl_b32 s17, s17, 8 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_readlane_b32 s17, v21, 21 -; VI-NEXT: v_readlane_b32 s18, v21, 20 -; VI-NEXT: s_and_b32 s17, s17, 0xff -; VI-NEXT: s_lshl_b32 s18, s18, 8 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_mov_b32_e32 v8, s16 -; VI-NEXT: v_readlane_b32 s16, v21, 19 -; VI-NEXT: s_lshl_b32 s16, s16, 8 -; VI-NEXT: s_and_b32 s17, s24, 0xff -; VI-NEXT: v_readlane_b32 s18, v21, 18 -; VI-NEXT: s_or_b32 s16, s17, s16 -; VI-NEXT: s_lshl_b32 s17, s30, 8 -; VI-NEXT: s_and_b32 s18, s18, 0xff -; VI-NEXT: s_or_b32 s17, s18, s17 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_readlane_b32 s17, v21, 17 -; VI-NEXT: v_mov_b32_e32 v9, s16 -; VI-NEXT: s_and_b32 s16, s25, 0xff -; VI-NEXT: s_lshl_b32 s17, s17, 8 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_readlane_b32 s17, v21, 16 -; VI-NEXT: v_readlane_b32 s18, v21, 15 -; VI-NEXT: s_and_b32 s17, s17, 0xff -; VI-NEXT: s_lshl_b32 s18, s18, 8 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_mov_b32_e32 v10, s16 -; VI-NEXT: v_readlane_b32 s16, v21, 14 -; VI-NEXT: s_lshl_b32 s16, s16, 8 -; VI-NEXT: s_and_b32 s17, s26, 0xff -; VI-NEXT: v_readlane_b32 s18, v21, 13 -; VI-NEXT: s_or_b32 s16, s17, s16 -; VI-NEXT: s_lshl_b32 s17, s90, 8 -; VI-NEXT: s_and_b32 s18, s18, 0xff -; VI-NEXT: s_or_b32 s17, s18, s17 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_readlane_b32 s17, v21, 12 -; VI-NEXT: v_mov_b32_e32 v11, s16 -; VI-NEXT: s_and_b32 s16, s27, 0xff -; VI-NEXT: s_lshl_b32 s17, s17, 8 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_readlane_b32 s17, v21, 11 -; VI-NEXT: v_readlane_b32 s18, v21, 10 -; VI-NEXT: s_and_b32 s17, s17, 0xff -; VI-NEXT: s_lshl_b32 s18, s18, 8 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_mov_b32_e32 v12, s16 -; VI-NEXT: v_readlane_b32 s16, v21, 9 -; VI-NEXT: s_lshl_b32 s16, s16, 8 -; VI-NEXT: s_and_b32 s17, s28, 0xff -; VI-NEXT: v_readlane_b32 s18, v21, 8 -; VI-NEXT: s_or_b32 s16, s17, s16 -; VI-NEXT: s_lshl_b32 s17, s88, 8 -; VI-NEXT: s_and_b32 s18, s18, 0xff -; VI-NEXT: s_or_b32 s17, s18, s17 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_readlane_b32 s17, v21, 7 -; VI-NEXT: v_mov_b32_e32 v13, s16 -; VI-NEXT: s_and_b32 s16, s29, 0xff -; VI-NEXT: s_lshl_b32 s17, s17, 8 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_readlane_b32 s17, v21, 6 -; VI-NEXT: v_readlane_b32 s18, v21, 5 -; VI-NEXT: s_and_b32 s17, s17, 0xff -; VI-NEXT: s_lshl_b32 s18, s18, 8 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_mov_b32_e32 v14, s16 -; VI-NEXT: v_readlane_b32 s16, v21, 4 -; VI-NEXT: s_lshl_b32 s16, s16, 8 -; VI-NEXT: s_and_b32 s17, s44, 0xff -; VI-NEXT: v_readlane_b32 s18, v21, 3 -; VI-NEXT: s_or_b32 s16, s17, s16 -; VI-NEXT: s_lshl_b32 s17, s76, 8 +; VI-NEXT: s_or_b32 s45, s45, s47 +; VI-NEXT: s_and_b32 s44, s44, 0xffff +; VI-NEXT: s_lshl_b32 s45, s45, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: s_lshl_b32 s44, s66, 8 +; VI-NEXT: s_and_b32 s42, s42, 0xff +; VI-NEXT: s_or_b32 s42, s42, s44 +; VI-NEXT: s_lshl_b32 s44, s38, 8 +; VI-NEXT: s_and_b32 s45, s65, 0xff +; VI-NEXT: s_or_b32 s44, s45, s44 +; VI-NEXT: s_and_b32 s42, s42, 0xffff +; VI-NEXT: s_lshl_b32 s44, s44, 16 +; VI-NEXT: s_or_b32 s42, s42, s44 +; VI-NEXT: v_mov_b32_e32 v3, s42 +; VI-NEXT: s_and_b32 s42, s43, 0xff +; VI-NEXT: s_lshl_b32 s43, s54, 8 +; VI-NEXT: s_or_b32 s42, s42, s43 +; VI-NEXT: s_and_b32 s43, s53, 0xff +; VI-NEXT: s_lshl_b32 s44, s51, 8 +; VI-NEXT: s_or_b32 s43, s43, s44 +; VI-NEXT: s_and_b32 s42, s42, 0xffff +; VI-NEXT: s_lshl_b32 s43, s43, 16 +; VI-NEXT: s_or_b32 s42, s42, s43 +; VI-NEXT: v_mov_b32_e32 v4, s42 +; VI-NEXT: s_lshl_b32 s42, s86, 8 +; VI-NEXT: s_and_b32 s28, s28, 0xff +; VI-NEXT: s_or_b32 s28, s28, s42 +; VI-NEXT: s_lshl_b32 s42, s36, 8 +; VI-NEXT: s_and_b32 s43, s84, 0xff +; VI-NEXT: s_or_b32 s42, s43, s42 +; VI-NEXT: s_and_b32 s28, s28, 0xffff +; VI-NEXT: s_lshl_b32 s42, s42, 16 +; VI-NEXT: s_or_b32 s28, s28, s42 +; VI-NEXT: v_mov_b32_e32 v5, s28 +; VI-NEXT: s_and_b32 s28, s29, 0xff +; VI-NEXT: s_lshl_b32 s29, s82, 8 +; VI-NEXT: v_readlane_b32 s42, v21, 25 +; VI-NEXT: s_or_b32 s28, s28, s29 +; VI-NEXT: s_and_b32 s29, s80, 0xff +; VI-NEXT: s_lshl_b32 s42, s42, 8 +; VI-NEXT: s_or_b32 s29, s29, s42 +; VI-NEXT: s_and_b32 s28, s28, 0xffff +; VI-NEXT: s_lshl_b32 s29, s29, 16 +; VI-NEXT: s_or_b32 s28, s28, s29 +; VI-NEXT: v_mov_b32_e32 v6, s28 +; VI-NEXT: v_readlane_b32 s28, v21, 24 +; VI-NEXT: s_lshl_b32 s28, s28, 8 +; VI-NEXT: s_and_b32 s26, s26, 0xff +; VI-NEXT: v_readlane_b32 s29, v21, 23 +; VI-NEXT: s_or_b32 s26, s26, s28 +; VI-NEXT: s_lshl_b32 s28, s34, 8 +; VI-NEXT: s_and_b32 s29, s29, 0xff +; VI-NEXT: s_or_b32 s28, s29, s28 +; VI-NEXT: s_and_b32 s26, s26, 0xffff +; VI-NEXT: s_lshl_b32 s28, s28, 16 +; VI-NEXT: s_or_b32 s26, s26, s28 +; VI-NEXT: v_mov_b32_e32 v7, s26 +; VI-NEXT: s_and_b32 s26, s27, 0xff +; VI-NEXT: v_readlane_b32 s27, v21, 22 +; VI-NEXT: s_lshl_b32 s27, s27, 8 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: v_readlane_b32 s27, v21, 21 +; VI-NEXT: v_readlane_b32 s28, v21, 20 +; VI-NEXT: s_and_b32 s27, s27, 0xff +; VI-NEXT: s_lshl_b32 s28, s28, 8 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s26, s26, 0xffff +; VI-NEXT: s_lshl_b32 s27, s27, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: v_mov_b32_e32 v8, s26 +; VI-NEXT: v_readlane_b32 s26, v21, 19 +; VI-NEXT: s_lshl_b32 s26, s26, 8 +; VI-NEXT: s_and_b32 s24, s24, 0xff +; VI-NEXT: v_readlane_b32 s27, v21, 18 +; VI-NEXT: s_or_b32 s24, s24, s26 +; VI-NEXT: s_lshl_b32 s26, s30, 8 +; VI-NEXT: s_and_b32 s27, s27, 0xff +; VI-NEXT: s_or_b32 s26, s27, s26 +; VI-NEXT: s_and_b32 s24, s24, 0xffff +; VI-NEXT: s_lshl_b32 s26, s26, 16 +; VI-NEXT: s_or_b32 s24, s24, s26 +; VI-NEXT: v_mov_b32_e32 v9, s24 +; VI-NEXT: s_and_b32 s24, s25, 0xff +; VI-NEXT: v_readlane_b32 s25, v21, 17 +; VI-NEXT: s_lshl_b32 s25, s25, 8 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: v_readlane_b32 s25, v21, 16 +; VI-NEXT: v_readlane_b32 s26, v21, 15 +; VI-NEXT: s_and_b32 s25, s25, 0xff +; VI-NEXT: s_lshl_b32 s26, s26, 8 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s24, s24, 0xffff +; VI-NEXT: s_lshl_b32 s25, s25, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_readlane_b32 s24, v21, 14 +; VI-NEXT: s_lshl_b32 s24, s24, 8 +; VI-NEXT: s_and_b32 s22, s22, 0xff +; VI-NEXT: v_readlane_b32 s25, v21, 13 +; VI-NEXT: s_or_b32 s22, s22, s24 +; VI-NEXT: s_lshl_b32 s24, s90, 8 +; VI-NEXT: s_and_b32 s25, s25, 0xff +; VI-NEXT: s_or_b32 s24, s25, s24 +; VI-NEXT: s_and_b32 s22, s22, 0xffff +; VI-NEXT: s_lshl_b32 s24, s24, 16 +; VI-NEXT: s_or_b32 s22, s22, s24 +; VI-NEXT: v_mov_b32_e32 v11, s22 +; VI-NEXT: s_and_b32 s22, s23, 0xff +; VI-NEXT: v_readlane_b32 s23, v21, 12 +; VI-NEXT: s_lshl_b32 s23, s23, 8 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: v_readlane_b32 s23, v21, 11 +; VI-NEXT: v_readlane_b32 s24, v21, 10 +; VI-NEXT: s_and_b32 s23, s23, 0xff +; VI-NEXT: s_lshl_b32 s24, s24, 8 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s22, s22, 0xffff +; VI-NEXT: s_lshl_b32 s23, s23, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: v_mov_b32_e32 v12, s22 +; VI-NEXT: v_readlane_b32 s22, v21, 9 +; VI-NEXT: s_lshl_b32 s22, s22, 8 +; VI-NEXT: s_and_b32 s20, s20, 0xff +; VI-NEXT: v_readlane_b32 s23, v21, 8 +; VI-NEXT: s_or_b32 s20, s20, s22 +; VI-NEXT: s_lshl_b32 s22, s88, 8 +; VI-NEXT: s_and_b32 s23, s23, 0xff +; VI-NEXT: s_or_b32 s22, s23, s22 +; VI-NEXT: s_and_b32 s20, s20, 0xffff +; VI-NEXT: s_lshl_b32 s22, s22, 16 +; VI-NEXT: s_or_b32 s20, s20, s22 +; VI-NEXT: v_mov_b32_e32 v13, s20 +; VI-NEXT: s_and_b32 s20, s21, 0xff +; VI-NEXT: v_readlane_b32 s21, v21, 7 +; VI-NEXT: s_lshl_b32 s21, s21, 8 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: v_readlane_b32 s21, v21, 6 +; VI-NEXT: v_readlane_b32 s22, v21, 5 +; VI-NEXT: s_and_b32 s21, s21, 0xff +; VI-NEXT: s_lshl_b32 s22, s22, 8 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s20, s20, 0xffff +; VI-NEXT: s_lshl_b32 s21, s21, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: v_mov_b32_e32 v14, s20 +; VI-NEXT: v_readlane_b32 s20, v21, 4 +; VI-NEXT: s_lshl_b32 s20, s20, 8 ; VI-NEXT: s_and_b32 s18, s18, 0xff +; VI-NEXT: v_readlane_b32 s21, v21, 3 +; VI-NEXT: s_or_b32 s18, s18, s20 +; VI-NEXT: s_lshl_b32 s20, s76, 8 +; VI-NEXT: s_and_b32 s21, s21, 0xff +; VI-NEXT: s_or_b32 s20, s21, s20 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v0 -; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s18, 0xffff +; VI-NEXT: s_lshl_b32 s20, s20, 16 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s18, s18, s20 ; VI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_readlane_b32 s17, v21, 2 +; VI-NEXT: v_mov_b32_e32 v15, s18 +; VI-NEXT: s_and_b32 s18, s19, 0xff +; VI-NEXT: v_readlane_b32 s19, v21, 2 ; VI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0 -; VI-NEXT: v_mov_b32_e32 v15, s16 -; VI-NEXT: s_and_b32 s16, s45, 0xff -; VI-NEXT: s_lshl_b32 s17, s17, 8 +; VI-NEXT: s_lshl_b32 s19, s19, 8 ; VI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0 -; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s18, s18, s19 ; VI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0 -; VI-NEXT: v_readlane_b32 s17, v21, 1 -; VI-NEXT: v_readlane_b32 s18, v21, 0 +; VI-NEXT: v_readlane_b32 s19, v21, 1 +; VI-NEXT: v_readlane_b32 s20, v21, 0 ; VI-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0 -; VI-NEXT: s_and_b32 s17, s17, 0xff -; VI-NEXT: s_lshl_b32 s18, s18, 8 +; VI-NEXT: s_and_b32 s19, s19, 0xff +; VI-NEXT: s_lshl_b32 s20, s20, 8 ; VI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0 -; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_or_b32 s19, s19, s20 ; VI-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 36, v0 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_and_b32 s18, s18, 0xffff +; VI-NEXT: s_lshl_b32 s19, s19, 16 ; VI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 40, v0 -; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s18, s18, s19 ; VI-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 44, v0 -; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: s_and_b32 s16, s42, 0xff -; VI-NEXT: s_lshl_b32 s17, s64, 8 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_and_b32 s18, s40, 0xff +; VI-NEXT: s_lshl_b32 s19, s64, 8 ; VI-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v0 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_and_b32 s17, s55, 0xff -; VI-NEXT: s_lshl_b32 s18, s78, 8 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, s55, 0xff +; VI-NEXT: s_lshl_b32 s20, s78, 8 ; VI-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 52, v0 -; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_or_b32 s19, s19, s20 ; VI-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 56, v0 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_and_b32 s18, s18, 0xffff +; VI-NEXT: s_lshl_b32 s19, s19, 16 ; VI-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 60, v0 -; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s18, s18, s19 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: s_and_b32 s16, s43, 0xff -; VI-NEXT: s_lshl_b32 s17, s52, 8 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_and_b32 s17, s50, 0xff -; VI-NEXT: s_lshl_b32 s18, s87, 8 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_and_b32 s18, s41, 0xff +; VI-NEXT: s_lshl_b32 s19, s52, 8 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, s50, 0xff +; VI-NEXT: s_lshl_b32 s20, s87, 8 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s18, s18, 0xffff +; VI-NEXT: s_lshl_b32 s19, s19, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 64, v0 -; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s18, s18, s19 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: s_and_b32 s16, s40, 0xff -; VI-NEXT: s_lshl_b32 s17, s85, 8 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_and_b32 s17, s83, 0xff -; VI-NEXT: s_lshl_b32 s18, s74, 8 -; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xff +; VI-NEXT: s_lshl_b32 s18, s85, 8 +; VI-NEXT: s_or_b32 s16, s16, s18 +; VI-NEXT: s_and_b32 s18, s83, 0xff +; VI-NEXT: s_lshl_b32 s19, s74, 8 +; VI-NEXT: s_or_b32 s18, s18, s19 ; VI-NEXT: s_and_b32 s16, s16, 0xffff -; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_lshl_b32 s18, s18, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x44, v0 -; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s16, s16, s18 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: s_and_b32 s16, s41, 0xff +; VI-NEXT: s_and_b32 s16, s17, 0xff ; VI-NEXT: s_lshl_b32 s17, s81, 8 ; VI-NEXT: s_or_b32 s16, s16, s17 ; VI-NEXT: v_readlane_b32 s17, v21, 57 @@ -218434,26 +217036,54 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: v_writelane_b32 v63, s96, 32 ; GFX9-NEXT: v_writelane_b32 v63, s97, 33 ; GFX9-NEXT: v_writelane_b32 v63, s98, 34 +; GFX9-NEXT: v_readfirstlane_b32 s56, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, s16 +; GFX9-NEXT: v_readfirstlane_b32 s57, v4 +; GFX9-NEXT: v_mov_b32_e32 v4, s17 +; GFX9-NEXT: v_readfirstlane_b32 s46, v5 +; GFX9-NEXT: v_mov_b32_e32 v5, s18 +; GFX9-NEXT: v_readfirstlane_b32 s47, v6 +; GFX9-NEXT: v_mov_b32_e32 v6, s19 +; GFX9-NEXT: v_readfirstlane_b32 s44, v7 +; GFX9-NEXT: v_mov_b32_e32 v7, s20 +; GFX9-NEXT: v_readfirstlane_b32 s45, v8 +; GFX9-NEXT: v_mov_b32_e32 v8, s21 +; GFX9-NEXT: v_readfirstlane_b32 s42, v9 +; GFX9-NEXT: v_mov_b32_e32 v9, s22 +; GFX9-NEXT: v_readfirstlane_b32 s43, v10 +; GFX9-NEXT: v_mov_b32_e32 v10, s23 +; GFX9-NEXT: v_readfirstlane_b32 s40, v11 +; GFX9-NEXT: v_mov_b32_e32 v11, s24 +; GFX9-NEXT: v_readfirstlane_b32 s41, v12 +; GFX9-NEXT: v_mov_b32_e32 v12, s25 +; GFX9-NEXT: v_readfirstlane_b32 s24, v13 +; GFX9-NEXT: v_mov_b32_e32 v13, s26 +; GFX9-NEXT: v_readfirstlane_b32 s25, v14 +; GFX9-NEXT: v_mov_b32_e32 v14, s27 +; GFX9-NEXT: v_readfirstlane_b32 s22, v15 +; GFX9-NEXT: v_mov_b32_e32 v15, s28 +; GFX9-NEXT: v_readfirstlane_b32 s23, v16 +; GFX9-NEXT: v_mov_b32_e32 v16, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 ; GFX9-NEXT: v_writelane_b32 v63, s99, 35 -; GFX9-NEXT: v_readfirstlane_b32 s44, v3 -; GFX9-NEXT: v_readfirstlane_b32 s45, v4 -; GFX9-NEXT: v_readfirstlane_b32 s42, v5 -; GFX9-NEXT: v_readfirstlane_b32 s43, v6 -; GFX9-NEXT: v_readfirstlane_b32 s40, v7 -; GFX9-NEXT: v_readfirstlane_b32 s41, v8 -; GFX9-NEXT: v_readfirstlane_b32 s14, v9 -; GFX9-NEXT: v_readfirstlane_b32 s15, v10 -; GFX9-NEXT: v_readfirstlane_b32 s12, v11 -; GFX9-NEXT: v_readfirstlane_b32 s13, v12 -; GFX9-NEXT: v_readfirstlane_b32 s10, v13 -; GFX9-NEXT: v_readfirstlane_b32 s11, v14 -; GFX9-NEXT: v_readfirstlane_b32 s8, v15 -; GFX9-NEXT: v_readfirstlane_b32 s9, v16 -; GFX9-NEXT: v_readfirstlane_b32 s6, v17 -; GFX9-NEXT: v_readfirstlane_b32 s7, v18 +; GFX9-NEXT: v_readfirstlane_b32 s20, v17 +; GFX9-NEXT: v_readfirstlane_b32 s21, v18 +; GFX9-NEXT: v_readfirstlane_b32 s18, v3 +; GFX9-NEXT: v_readfirstlane_b32 s19, v4 +; GFX9-NEXT: v_readfirstlane_b32 s16, v5 +; GFX9-NEXT: v_readfirstlane_b32 s17, v6 +; GFX9-NEXT: v_readfirstlane_b32 s14, v7 +; GFX9-NEXT: v_readfirstlane_b32 s15, v8 +; GFX9-NEXT: v_readfirstlane_b32 s12, v9 +; GFX9-NEXT: v_readfirstlane_b32 s13, v10 +; GFX9-NEXT: v_readfirstlane_b32 s10, v11 +; GFX9-NEXT: v_readfirstlane_b32 s11, v12 +; GFX9-NEXT: v_readfirstlane_b32 s8, v13 +; GFX9-NEXT: v_readfirstlane_b32 s9, v14 +; GFX9-NEXT: v_readfirstlane_b32 s6, v15 +; GFX9-NEXT: v_readfirstlane_b32 s7, v16 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_and_b64 s[26:27], vcc, exec ; GFX9-NEXT: v_readfirstlane_b32 s5, v2 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill @@ -218472,225 +217102,225 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane ; GFX9-NEXT: s_cbranch_scc0 .LBB99_3 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_lshr_b32 s46, s5, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 49 -; GFX9-NEXT: s_lshr_b32 s46, s5, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 48 -; GFX9-NEXT: s_lshr_b32 s46, s5, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 47 -; GFX9-NEXT: s_lshr_b32 s46, s4, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 46 -; GFX9-NEXT: s_lshr_b32 s46, s4, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 45 -; GFX9-NEXT: s_lshr_b32 s46, s29, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 44 -; GFX9-NEXT: s_lshr_b32 s46, s29, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 43 -; GFX9-NEXT: s_lshr_b32 s46, s29, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 42 -; GFX9-NEXT: s_lshr_b32 s46, s28, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 41 -; GFX9-NEXT: s_lshr_b32 s46, s28, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 40 -; GFX9-NEXT: s_lshr_b32 s46, s27, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 39 -; GFX9-NEXT: s_lshr_b32 s46, s27, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 38 -; GFX9-NEXT: s_lshr_b32 s46, s27, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 37 -; GFX9-NEXT: s_lshr_b32 s46, s26, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 36 -; GFX9-NEXT: s_lshr_b32 s46, s26, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 35 -; GFX9-NEXT: s_lshr_b32 s46, s25, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 34 -; GFX9-NEXT: s_lshr_b32 s46, s25, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 33 -; GFX9-NEXT: s_lshr_b32 s46, s25, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 32 -; GFX9-NEXT: s_lshr_b32 s46, s24, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 31 -; GFX9-NEXT: s_lshr_b32 s46, s24, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 30 -; GFX9-NEXT: s_lshr_b32 s46, s23, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 29 -; GFX9-NEXT: s_lshr_b32 s46, s23, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 28 -; GFX9-NEXT: s_lshr_b32 s46, s23, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 27 -; GFX9-NEXT: s_lshr_b32 s46, s22, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 26 -; GFX9-NEXT: s_lshr_b32 s46, s22, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 25 -; GFX9-NEXT: s_lshr_b32 s46, s21, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 24 -; GFX9-NEXT: s_lshr_b32 s46, s21, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 23 -; GFX9-NEXT: s_lshr_b32 s46, s21, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 22 -; GFX9-NEXT: s_lshr_b32 s46, s20, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 21 -; GFX9-NEXT: s_lshr_b32 s46, s20, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 20 -; GFX9-NEXT: s_lshr_b32 s46, s19, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 19 -; GFX9-NEXT: s_lshr_b32 s46, s19, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 18 -; GFX9-NEXT: s_lshr_b32 s46, s19, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 17 -; GFX9-NEXT: s_lshr_b32 s46, s18, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 16 -; GFX9-NEXT: s_lshr_b32 s46, s18, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 15 -; GFX9-NEXT: s_lshr_b32 s46, s17, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 14 -; GFX9-NEXT: s_lshr_b32 s46, s17, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 13 -; GFX9-NEXT: s_lshr_b32 s46, s17, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 12 -; GFX9-NEXT: s_lshr_b32 s46, s16, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 11 -; GFX9-NEXT: s_lshr_b32 s46, s16, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 10 -; GFX9-NEXT: s_lshr_b32 s46, s7, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 9 -; GFX9-NEXT: s_lshr_b32 s46, s7, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 8 -; GFX9-NEXT: s_lshr_b32 s46, s7, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 7 -; GFX9-NEXT: s_lshr_b32 s46, s6, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 6 -; GFX9-NEXT: s_lshr_b32 s46, s6, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 5 -; GFX9-NEXT: s_lshr_b32 s46, s9, 24 -; GFX9-NEXT: v_writelane_b32 v62, s46, 4 -; GFX9-NEXT: s_lshr_b32 s46, s9, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 3 -; GFX9-NEXT: s_lshr_b32 s46, s9, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 2 -; GFX9-NEXT: s_lshr_b32 s46, s8, 16 -; GFX9-NEXT: v_writelane_b32 v62, s46, 1 -; GFX9-NEXT: s_lshr_b32 s46, s8, 8 -; GFX9-NEXT: v_writelane_b32 v62, s46, 0 -; GFX9-NEXT: s_lshr_b32 s82, s11, 24 -; GFX9-NEXT: s_lshr_b32 s83, s11, 16 -; GFX9-NEXT: s_lshr_b32 s85, s11, 8 -; GFX9-NEXT: s_lshr_b32 s84, s10, 16 -; GFX9-NEXT: s_lshr_b32 s86, s10, 8 -; GFX9-NEXT: s_lshr_b32 s87, s13, 24 -; GFX9-NEXT: s_lshr_b32 s96, s13, 16 -; GFX9-NEXT: s_lshr_b32 s98, s13, 8 -; GFX9-NEXT: s_lshr_b32 s97, s12, 16 -; GFX9-NEXT: s_lshr_b32 s99, s12, 8 -; GFX9-NEXT: s_lshr_b32 s38, s15, 24 -; GFX9-NEXT: s_lshr_b32 s39, s15, 16 -; GFX9-NEXT: s_lshr_b32 s49, s15, 8 -; GFX9-NEXT: s_lshr_b32 s48, s14, 16 -; GFX9-NEXT: s_lshr_b32 s50, s14, 8 -; GFX9-NEXT: s_lshr_b32 s51, s41, 24 -; GFX9-NEXT: s_lshr_b32 s52, s41, 16 -; GFX9-NEXT: s_lshr_b32 s54, s41, 8 -; GFX9-NEXT: s_lshr_b32 s53, s40, 16 -; GFX9-NEXT: s_lshr_b32 s55, s40, 8 -; GFX9-NEXT: s_lshr_b32 s64, s43, 24 -; GFX9-NEXT: s_lshr_b32 s65, s43, 16 -; GFX9-NEXT: s_lshr_b32 s67, s43, 8 -; GFX9-NEXT: s_lshr_b32 s66, s42, 16 -; GFX9-NEXT: s_lshr_b32 s68, s42, 8 -; GFX9-NEXT: s_lshr_b32 s69, s45, 24 -; GFX9-NEXT: s_lshr_b32 s70, s45, 16 -; GFX9-NEXT: s_lshr_b32 s80, s45, 8 -; GFX9-NEXT: s_lshr_b32 s71, s44, 16 -; GFX9-NEXT: s_lshr_b32 s81, s44, 8 -; GFX9-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 -; GFX9-NEXT: s_lshr_b64 s[56:57], s[28:29], 24 -; GFX9-NEXT: s_lshr_b64 s[58:59], s[26:27], 24 -; GFX9-NEXT: s_lshr_b64 s[60:61], s[24:25], 24 -; GFX9-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 -; GFX9-NEXT: s_lshr_b64 s[72:73], s[20:21], 24 -; GFX9-NEXT: s_lshr_b64 s[74:75], s[18:19], 24 -; GFX9-NEXT: s_lshr_b64 s[76:77], s[16:17], 24 -; GFX9-NEXT: s_lshr_b64 s[78:79], s[6:7], 24 -; GFX9-NEXT: s_lshr_b64 s[88:89], s[8:9], 24 -; GFX9-NEXT: s_lshr_b64 s[90:91], s[10:11], 24 -; GFX9-NEXT: s_lshr_b64 s[92:93], s[12:13], 24 -; GFX9-NEXT: s_lshr_b64 s[94:95], s[14:15], 24 -; GFX9-NEXT: s_lshr_b64 s[30:31], s[40:41], 24 -; GFX9-NEXT: s_lshr_b64 s[34:35], s[42:43], 24 -; GFX9-NEXT: s_lshr_b64 s[36:37], s[44:45], 24 +; GFX9-NEXT: s_lshr_b32 s26, s5, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 49 +; GFX9-NEXT: s_lshr_b32 s26, s5, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 48 +; GFX9-NEXT: s_lshr_b32 s26, s5, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 47 +; GFX9-NEXT: s_lshr_b32 s26, s4, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 46 +; GFX9-NEXT: s_lshr_b32 s26, s4, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 45 +; GFX9-NEXT: s_lshr_b32 s26, s7, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 44 +; GFX9-NEXT: s_lshr_b32 s26, s7, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 43 +; GFX9-NEXT: s_lshr_b32 s26, s7, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 42 +; GFX9-NEXT: s_lshr_b32 s26, s6, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 41 +; GFX9-NEXT: s_lshr_b32 s26, s6, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 40 +; GFX9-NEXT: s_lshr_b32 s26, s9, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 39 +; GFX9-NEXT: s_lshr_b32 s26, s9, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 38 +; GFX9-NEXT: s_lshr_b32 s26, s9, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 37 +; GFX9-NEXT: s_lshr_b32 s26, s8, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 36 +; GFX9-NEXT: s_lshr_b32 s26, s8, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 35 +; GFX9-NEXT: s_lshr_b32 s26, s11, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 34 +; GFX9-NEXT: s_lshr_b32 s26, s11, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 33 +; GFX9-NEXT: s_lshr_b32 s26, s11, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 32 +; GFX9-NEXT: s_lshr_b32 s26, s10, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 31 +; GFX9-NEXT: s_lshr_b32 s26, s10, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 30 +; GFX9-NEXT: s_lshr_b32 s26, s13, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 29 +; GFX9-NEXT: s_lshr_b32 s26, s13, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 28 +; GFX9-NEXT: s_lshr_b32 s26, s13, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 27 +; GFX9-NEXT: s_lshr_b32 s26, s12, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 26 +; GFX9-NEXT: s_lshr_b32 s26, s12, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 25 +; GFX9-NEXT: s_lshr_b32 s26, s15, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 24 +; GFX9-NEXT: s_lshr_b32 s26, s15, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 23 +; GFX9-NEXT: s_lshr_b32 s26, s15, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 22 +; GFX9-NEXT: s_lshr_b32 s26, s14, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 21 +; GFX9-NEXT: s_lshr_b32 s26, s14, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 20 +; GFX9-NEXT: s_lshr_b32 s26, s17, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 19 +; GFX9-NEXT: s_lshr_b32 s26, s17, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 18 +; GFX9-NEXT: s_lshr_b32 s26, s17, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 17 +; GFX9-NEXT: s_lshr_b32 s26, s16, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 16 +; GFX9-NEXT: s_lshr_b32 s26, s16, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 15 +; GFX9-NEXT: s_lshr_b32 s26, s19, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 14 +; GFX9-NEXT: s_lshr_b32 s26, s19, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 13 +; GFX9-NEXT: s_lshr_b32 s26, s19, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 12 +; GFX9-NEXT: s_lshr_b32 s26, s18, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 11 +; GFX9-NEXT: s_lshr_b32 s26, s18, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 10 +; GFX9-NEXT: s_lshr_b32 s26, s21, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 9 +; GFX9-NEXT: s_lshr_b32 s26, s21, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 8 +; GFX9-NEXT: s_lshr_b32 s26, s21, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 7 +; GFX9-NEXT: s_lshr_b32 s26, s20, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 6 +; GFX9-NEXT: s_lshr_b32 s26, s20, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 5 +; GFX9-NEXT: s_lshr_b32 s26, s23, 24 +; GFX9-NEXT: v_writelane_b32 v62, s26, 4 +; GFX9-NEXT: s_lshr_b32 s26, s23, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 3 +; GFX9-NEXT: s_lshr_b32 s26, s23, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 2 +; GFX9-NEXT: s_lshr_b32 s26, s22, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 1 +; GFX9-NEXT: s_lshr_b32 s26, s22, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 0 +; GFX9-NEXT: s_lshr_b32 s82, s25, 24 +; GFX9-NEXT: s_lshr_b32 s83, s25, 16 +; GFX9-NEXT: s_lshr_b32 s85, s25, 8 +; GFX9-NEXT: s_lshr_b32 s84, s24, 16 +; GFX9-NEXT: s_lshr_b32 s86, s24, 8 +; GFX9-NEXT: s_lshr_b32 s87, s41, 24 +; GFX9-NEXT: s_lshr_b32 s96, s41, 16 +; GFX9-NEXT: s_lshr_b32 s98, s41, 8 +; GFX9-NEXT: s_lshr_b32 s97, s40, 16 +; GFX9-NEXT: s_lshr_b32 s99, s40, 8 +; GFX9-NEXT: s_lshr_b32 s38, s43, 24 +; GFX9-NEXT: s_lshr_b32 s39, s43, 16 +; GFX9-NEXT: s_lshr_b32 s49, s43, 8 +; GFX9-NEXT: s_lshr_b32 s48, s42, 16 +; GFX9-NEXT: s_lshr_b32 s50, s42, 8 +; GFX9-NEXT: s_lshr_b32 s51, s45, 24 +; GFX9-NEXT: s_lshr_b32 s52, s45, 16 +; GFX9-NEXT: s_lshr_b32 s54, s45, 8 +; GFX9-NEXT: s_lshr_b32 s53, s44, 16 +; GFX9-NEXT: s_lshr_b32 s55, s44, 8 +; GFX9-NEXT: s_lshr_b32 s64, s47, 24 +; GFX9-NEXT: s_lshr_b32 s65, s47, 16 +; GFX9-NEXT: s_lshr_b32 s67, s47, 8 +; GFX9-NEXT: s_lshr_b32 s66, s46, 16 +; GFX9-NEXT: s_lshr_b32 s68, s46, 8 +; GFX9-NEXT: s_lshr_b32 s69, s57, 24 +; GFX9-NEXT: s_lshr_b32 s70, s57, 16 +; GFX9-NEXT: s_lshr_b32 s80, s57, 8 +; GFX9-NEXT: s_lshr_b32 s71, s56, 16 +; GFX9-NEXT: s_lshr_b32 s81, s56, 8 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[28:29], s[6:7], 24 +; GFX9-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 +; GFX9-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 +; GFX9-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; GFX9-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 +; GFX9-NEXT: s_lshr_b64 s[74:75], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[76:77], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[88:89], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[90:91], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[92:93], s[40:41], 24 +; GFX9-NEXT: s_lshr_b64 s[94:95], s[42:43], 24 +; GFX9-NEXT: s_lshr_b64 s[30:31], s[44:45], 24 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[46:47], 24 +; GFX9-NEXT: s_lshr_b64 s[36:37], s[56:57], 24 ; GFX9-NEXT: s_cbranch_execnz .LBB99_4 ; GFX9-NEXT: .LBB99_2: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v26, s5, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v25, s4, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[25:26] -; GFX9-NEXT: v_pk_add_u16 v28, s29, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v27, s28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v28, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v27, s6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[27:28] -; GFX9-NEXT: v_pk_add_u16 v30, s27, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v29, s26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v30, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v29, s8, 3 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[29:30] -; GFX9-NEXT: v_pk_add_u16 v32, s25, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v31, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v32, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v31, s10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[31:32] -; GFX9-NEXT: v_pk_add_u16 v34, s23, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v33, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v34, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v33, s12, 3 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[33:34] -; GFX9-NEXT: v_pk_add_u16 v36, s21, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v35, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v36, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v35, s14, 3 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[35:36] -; GFX9-NEXT: v_pk_add_u16 v38, s19, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v37, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v38, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v37, s16, 3 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[37:38] -; GFX9-NEXT: v_pk_add_u16 v49, s17, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v48, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v49, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v48, s18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[48:49] -; GFX9-NEXT: v_pk_add_u16 v2, s7, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v1, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s20, 3 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] -; GFX9-NEXT: v_pk_add_u16 v4, s9, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v3, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s22, 3 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[3:4] -; GFX9-NEXT: v_pk_add_u16 v6, s11, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v5, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s24, 3 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[5:6] -; GFX9-NEXT: v_pk_add_u16 v8, s13, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v7, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s41, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s40, 3 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[7:8] -; GFX9-NEXT: v_pk_add_u16 v10, s15, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v9, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s43, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s42, 3 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill @@ -218698,8 +217328,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v4 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; GFX9-NEXT: v_pk_add_u16 v12, s41, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v11, s40, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s45, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s44, 3 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill @@ -218710,8 +217340,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v6 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v6 -; GFX9-NEXT: v_pk_add_u16 v14, s43, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v13, s42, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, s47, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s46, 3 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill @@ -218722,8 +217352,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v5 -; GFX9-NEXT: v_pk_add_u16 v22, s45, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v21, s44, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, s57, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, s56, 3 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill @@ -218850,10 +217480,10 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v21 ; GFX9-NEXT: s_branch .LBB99_5 ; GFX9-NEXT: .LBB99_3: -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 ; GFX9-NEXT: ; implicit-def: $sgpr81 ; GFX9-NEXT: ; implicit-def: $sgpr71 ; GFX9-NEXT: ; implicit-def: $sgpr80 @@ -218890,7 +217520,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: ; implicit-def: $sgpr62 ; GFX9-NEXT: ; implicit-def: $sgpr60 ; GFX9-NEXT: ; implicit-def: $sgpr58 -; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr28 ; GFX9-NEXT: ; implicit-def: $sgpr36 ; GFX9-NEXT: ; implicit-def: $sgpr34 ; GFX9-NEXT: ; implicit-def: $sgpr30 @@ -218899,103 +217529,103 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: ; implicit-def: $sgpr90 ; GFX9-NEXT: ; implicit-def: $sgpr88 ; GFX9-NEXT: ; implicit-def: $sgpr78 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: s_branch .LBB99_2 ; GFX9-NEXT: .LBB99_4: ; GFX9-NEXT: v_mov_b32_e32 v15, s71 @@ -219180,11 +217810,11 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v41, s4 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s46 +; GFX9-NEXT: v_mov_b32_e32 v41, s26 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s56 +; GFX9-NEXT: v_mov_b32_e32 v41, s28 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill @@ -219244,36 +217874,36 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v21, s44 -; GFX9-NEXT: v_mov_b32_e32 v22, s45 -; GFX9-NEXT: v_mov_b32_e32 v13, s42 -; GFX9-NEXT: v_mov_b32_e32 v14, s43 -; GFX9-NEXT: v_mov_b32_e32 v11, s40 -; GFX9-NEXT: v_mov_b32_e32 v12, s41 -; GFX9-NEXT: v_mov_b32_e32 v9, s14 -; GFX9-NEXT: v_mov_b32_e32 v10, s15 -; GFX9-NEXT: v_mov_b32_e32 v7, s12 -; GFX9-NEXT: v_mov_b32_e32 v8, s13 -; GFX9-NEXT: v_mov_b32_e32 v5, s10 -; GFX9-NEXT: v_mov_b32_e32 v6, s11 -; GFX9-NEXT: v_mov_b32_e32 v3, s8 -; GFX9-NEXT: v_mov_b32_e32 v4, s9 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NEXT: v_mov_b32_e32 v48, s16 -; GFX9-NEXT: v_mov_b32_e32 v49, s17 -; GFX9-NEXT: v_mov_b32_e32 v37, s18 -; GFX9-NEXT: v_mov_b32_e32 v38, s19 -; GFX9-NEXT: v_mov_b32_e32 v35, s20 -; GFX9-NEXT: v_mov_b32_e32 v36, s21 -; GFX9-NEXT: v_mov_b32_e32 v33, s22 -; GFX9-NEXT: v_mov_b32_e32 v34, s23 -; GFX9-NEXT: v_mov_b32_e32 v31, s24 -; GFX9-NEXT: v_mov_b32_e32 v32, s25 -; GFX9-NEXT: v_mov_b32_e32 v29, s26 -; GFX9-NEXT: v_mov_b32_e32 v30, s27 -; GFX9-NEXT: v_mov_b32_e32 v27, s28 -; GFX9-NEXT: v_mov_b32_e32 v28, s29 +; GFX9-NEXT: v_mov_b32_e32 v21, s56 +; GFX9-NEXT: v_mov_b32_e32 v22, s57 +; GFX9-NEXT: v_mov_b32_e32 v13, s46 +; GFX9-NEXT: v_mov_b32_e32 v14, s47 +; GFX9-NEXT: v_mov_b32_e32 v11, s44 +; GFX9-NEXT: v_mov_b32_e32 v12, s45 +; GFX9-NEXT: v_mov_b32_e32 v9, s42 +; GFX9-NEXT: v_mov_b32_e32 v10, s43 +; GFX9-NEXT: v_mov_b32_e32 v7, s40 +; GFX9-NEXT: v_mov_b32_e32 v8, s41 +; GFX9-NEXT: v_mov_b32_e32 v5, s24 +; GFX9-NEXT: v_mov_b32_e32 v6, s25 +; GFX9-NEXT: v_mov_b32_e32 v3, s22 +; GFX9-NEXT: v_mov_b32_e32 v4, s23 +; GFX9-NEXT: v_mov_b32_e32 v1, s20 +; GFX9-NEXT: v_mov_b32_e32 v2, s21 +; GFX9-NEXT: v_mov_b32_e32 v48, s18 +; GFX9-NEXT: v_mov_b32_e32 v49, s19 +; GFX9-NEXT: v_mov_b32_e32 v37, s16 +; GFX9-NEXT: v_mov_b32_e32 v38, s17 +; GFX9-NEXT: v_mov_b32_e32 v35, s14 +; GFX9-NEXT: v_mov_b32_e32 v36, s15 +; GFX9-NEXT: v_mov_b32_e32 v33, s12 +; GFX9-NEXT: v_mov_b32_e32 v34, s13 +; GFX9-NEXT: v_mov_b32_e32 v31, s10 +; GFX9-NEXT: v_mov_b32_e32 v32, s11 +; GFX9-NEXT: v_mov_b32_e32 v29, s8 +; GFX9-NEXT: v_mov_b32_e32 v30, s9 +; GFX9-NEXT: v_mov_b32_e32 v27, s6 +; GFX9-NEXT: v_mov_b32_e32 v28, s7 ; GFX9-NEXT: v_mov_b32_e32 v26, s5 ; GFX9-NEXT: v_mov_b32_e32 v41, v50 ; GFX9-NEXT: v_mov_b32_e32 v50, v51 @@ -219711,33 +218341,41 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: s_mov_b32 exec_lo, s4 ; GFX11-NEXT: v_writelane_b32 v75, s30, 0 ; GFX11-NEXT: v_writelane_b32 v76, s96, 0 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 -; GFX11-NEXT: v_readfirstlane_b32 s40, v1 -; GFX11-NEXT: v_readfirstlane_b32 s41, v2 ; GFX11-NEXT: v_writelane_b32 v75, s31, 1 ; GFX11-NEXT: v_writelane_b32 v76, s97, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_readfirstlane_b32 s40, v16 +; GFX11-NEXT: v_readfirstlane_b32 s41, v17 +; GFX11-NEXT: v_readfirstlane_b32 s28, v1 +; GFX11-NEXT: v_writelane_b32 v75, s34, 2 +; GFX11-NEXT: v_writelane_b32 v76, s98, 2 +; GFX11-NEXT: v_readfirstlane_b32 s29, v2 ; GFX11-NEXT: v_readfirstlane_b32 s14, v3 ; GFX11-NEXT: v_readfirstlane_b32 s15, v4 +; GFX11-NEXT: v_writelane_b32 v75, s35, 3 +; GFX11-NEXT: v_writelane_b32 v76, s99, 3 ; GFX11-NEXT: v_readfirstlane_b32 s12, v5 -; GFX11-NEXT: v_writelane_b32 v75, s34, 2 -; GFX11-NEXT: v_writelane_b32 v76, s98, 2 ; GFX11-NEXT: v_readfirstlane_b32 s13, v6 ; GFX11-NEXT: v_readfirstlane_b32 s10, v7 +; GFX11-NEXT: v_writelane_b32 v75, s36, 4 +; GFX11-NEXT: v_writelane_b32 v76, s100, 4 ; GFX11-NEXT: v_readfirstlane_b32 s11, v8 -; GFX11-NEXT: v_writelane_b32 v75, s35, 3 -; GFX11-NEXT: v_writelane_b32 v76, s99, 3 ; GFX11-NEXT: v_readfirstlane_b32 s8, v9 ; GFX11-NEXT: v_readfirstlane_b32 s9, v10 +; GFX11-NEXT: v_writelane_b32 v75, s37, 5 +; GFX11-NEXT: v_writelane_b32 v76, s101, 5 ; GFX11-NEXT: v_readfirstlane_b32 s6, v11 -; GFX11-NEXT: v_writelane_b32 v75, s36, 4 -; GFX11-NEXT: v_writelane_b32 v76, s100, 4 ; GFX11-NEXT: v_readfirstlane_b32 s7, v12 ; GFX11-NEXT: v_readfirstlane_b32 s4, v13 +; GFX11-NEXT: v_writelane_b32 v75, s38, 6 +; GFX11-NEXT: v_writelane_b32 v76, s102, 6 ; GFX11-NEXT: v_readfirstlane_b32 s5, v14 -; GFX11-NEXT: v_writelane_b32 v75, s37, 5 -; GFX11-NEXT: v_writelane_b32 v76, s101, 5 ; GFX11-NEXT: s_mov_b32 s99, 0 ; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo +; GFX11-NEXT: v_writelane_b32 v75, s39, 7 +; GFX11-NEXT: v_writelane_b32 v76, s103, 7 ; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:72 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:68 @@ -219758,12 +218396,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:8 ; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:4 ; GFX11-NEXT: scratch_store_b32 off, v74, s32 -; GFX11-NEXT: v_writelane_b32 v75, s38, 6 -; GFX11-NEXT: v_writelane_b32 v76, s102, 6 ; GFX11-NEXT: ; implicit-def: $vgpr78 : SGPR spill to VGPR lane ; GFX11-NEXT: ; implicit-def: $vgpr77 : SGPR spill to VGPR lane -; GFX11-NEXT: v_writelane_b32 v75, s39, 7 -; GFX11-NEXT: v_writelane_b32 v76, s103, 7 ; GFX11-NEXT: v_writelane_b32 v75, s48, 8 ; GFX11-NEXT: v_writelane_b32 v76, s104, 8 ; GFX11-NEXT: v_writelane_b32 v75, s49, 9 @@ -219848,19 +218482,19 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: s_lshr_b32 s87, s14, 8 ; GFX11-NEXT: v_writelane_b32 v78, s42, 29 ; GFX11-NEXT: s_lshr_b32 s42, s22, 16 -; GFX11-NEXT: s_lshr_b32 s96, s41, 24 -; GFX11-NEXT: s_lshr_b32 s97, s41, 16 -; GFX11-NEXT: s_lshr_b32 s100, s41, 8 +; GFX11-NEXT: s_lshr_b32 s96, s29, 24 +; GFX11-NEXT: s_lshr_b32 s97, s29, 16 +; GFX11-NEXT: s_lshr_b32 s100, s29, 8 ; GFX11-NEXT: v_writelane_b32 v78, s42, 28 ; GFX11-NEXT: s_lshr_b32 s42, s22, 8 -; GFX11-NEXT: s_lshr_b32 s98, s40, 16 -; GFX11-NEXT: s_lshr_b32 s101, s40, 8 -; GFX11-NEXT: s_lshr_b32 s102, s29, 24 +; GFX11-NEXT: s_lshr_b32 s98, s28, 16 +; GFX11-NEXT: s_lshr_b32 s101, s28, 8 +; GFX11-NEXT: s_lshr_b32 s102, s41, 24 ; GFX11-NEXT: v_writelane_b32 v78, s42, 27 ; GFX11-NEXT: s_lshr_b32 s42, s21, 24 -; GFX11-NEXT: s_lshr_b32 s103, s29, 16 -; GFX11-NEXT: s_lshr_b32 vcc_hi, s29, 8 -; GFX11-NEXT: s_lshr_b32 s104, s28, 16 +; GFX11-NEXT: s_lshr_b32 s103, s41, 16 +; GFX11-NEXT: s_lshr_b32 vcc_hi, s41, 8 +; GFX11-NEXT: s_lshr_b32 s104, s40, 16 ; GFX11-NEXT: v_writelane_b32 v78, s42, 26 ; GFX11-NEXT: s_lshr_b32 s42, s21, 16 ; GFX11-NEXT: s_lshr_b64 s[62:63], s[26:27], 24 @@ -219883,8 +218517,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: s_lshr_b64 s[92:93], s[14:15], 24 ; GFX11-NEXT: v_writelane_b32 v78, s42, 22 ; GFX11-NEXT: s_lshr_b32 s42, s19, 24 -; GFX11-NEXT: s_lshr_b64 s[94:95], s[40:41], 24 -; GFX11-NEXT: s_lshr_b64 s[30:31], s[28:29], 24 +; GFX11-NEXT: s_lshr_b64 s[94:95], s[28:29], 24 +; GFX11-NEXT: s_lshr_b64 s[30:31], s[40:41], 24 ; GFX11-NEXT: v_writelane_b32 v78, s42, 21 ; GFX11-NEXT: s_lshr_b32 s42, s19, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) @@ -219934,7 +218568,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: s_lshr_b32 s42, s0, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_writelane_b32 v78, s42, 2 -; GFX11-NEXT: s_lshr_b32 s42, s28, 8 +; GFX11-NEXT: s_lshr_b32 s42, s40, 8 ; GFX11-NEXT: v_writelane_b32 v78, s74, 0 ; GFX11-NEXT: v_writelane_b32 v78, s75, 1 ; GFX11-NEXT: s_lshr_b64 s[74:75], s[4:5], 24 @@ -219951,10 +218585,10 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: v_pk_add_u16 v29, s23, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v28, s22, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v32, s20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v16, s29, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v15, s28, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, s41, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v13, s40, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v16, s41, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v15, s40, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v14, s29, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, s28, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v12, s15, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v11, s14, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v10, s13, 3 op_sel_hi:[1,0] @@ -220214,8 +218848,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: v_dual_mov_b32 v52, s0 :: v_dual_mov_b32 v53, s1 ; GFX11-NEXT: v_readlane_b32 s0, v78, 2 ; GFX11-NEXT: v_mov_b32_e32 v71, s50 -; GFX11-NEXT: v_dual_mov_b32 v15, s28 :: v_dual_mov_b32 v16, s29 -; GFX11-NEXT: v_dual_mov_b32 v13, s40 :: v_dual_mov_b32 v14, s41 +; GFX11-NEXT: v_dual_mov_b32 v15, s40 :: v_dual_mov_b32 v16, s41 +; GFX11-NEXT: v_dual_mov_b32 v13, s28 :: v_dual_mov_b32 v14, s29 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mov_b32_e32 v74, s0 ; GFX11-NEXT: v_readlane_b32 s0, v78, 3 @@ -225623,674 +224257,724 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; VI-LABEL: bitcast_v64bf16_to_v64f16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v42, s30, 0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; VI-NEXT: v_writelane_b32 v42, s31, 1 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v31, v17 ; VI-NEXT: v_mov_b32_e32 v30, v16 -; VI-NEXT: v_mov_b32_e32 v32, v15 -; VI-NEXT: v_mov_b32_e32 v33, v13 -; VI-NEXT: v_mov_b32_e32 v34, v11 -; VI-NEXT: v_mov_b32_e32 v35, v9 -; VI-NEXT: v_mov_b32_e32 v36, v7 -; VI-NEXT: v_mov_b32_e32 v37, v5 -; VI-NEXT: v_mov_b32_e32 v38, v3 +; VI-NEXT: v_mov_b32_e32 v48, v15 +; VI-NEXT: v_mov_b32_e32 v49, v13 +; VI-NEXT: v_mov_b32_e32 v50, v11 +; VI-NEXT: v_mov_b32_e32 v51, v9 +; VI-NEXT: v_mov_b32_e32 v52, v7 +; VI-NEXT: v_mov_b32_e32 v53, v5 +; VI-NEXT: v_mov_b32_e32 v54, v3 +; VI-NEXT: v_mov_b32_e32 v55, v1 ; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v26, v12 ; VI-NEXT: v_mov_b32_e32 v24, v10 ; VI-NEXT: v_mov_b32_e32 v22, v8 ; VI-NEXT: v_mov_b32_e32 v20, v6 -; VI-NEXT: v_mov_b32_e32 v48, v4 +; VI-NEXT: v_mov_b32_e32 v40, v4 ; VI-NEXT: v_mov_b32_e32 v16, v2 -; VI-NEXT: v_readfirstlane_b32 s30, v0 +; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_readfirstlane_b32 s31, v1 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: s_cbranch_scc0 .LBB101_3 +; VI-NEXT: v_mov_b32_e32 v39, s17 +; VI-NEXT: v_mov_b32_e32 v38, s19 +; VI-NEXT: v_mov_b32_e32 v37, s21 +; VI-NEXT: v_mov_b32_e32 v36, s23 +; VI-NEXT: v_mov_b32_e32 v35, s25 +; VI-NEXT: v_mov_b32_e32 v34, s27 +; VI-NEXT: v_mov_b32_e32 v18, s29 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_cbranch_scc0 .LBB101_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB101_4 +; VI-NEXT: s_cbranch_execnz .LBB101_3 ; VI-NEXT: .LBB101_2: ; %cmp.true -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 -; VI-NEXT: v_mov_b32_e32 v17, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v0, s4, v17 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_add_f32_e32 v0, s4, v17 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v16 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v5, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v54 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v15, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40 +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v15, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v5, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v53 +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v15, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v20 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v40, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v20 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v5, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v52 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v20, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v52, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v5, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v51 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v22, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v51 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v42, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v5, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v50 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v24, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v26 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v50, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v5, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v49 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v26, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v28 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v44, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v28 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v5, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v48 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v28, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v30 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v48, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v5, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v3 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v46, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v30, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v17 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v39 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: s_lshl_b32 s4, s30, 16 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_add_f32_e32 v2, s4, v17 -; VI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[0:1] -; VI-NEXT: v_add_f32_e32 v0, s4, v17 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; VI-NEXT: v_cndmask_b32_e32 v16, v3, v5, vcc ; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: s_lshl_b32 s4, s26, 16 -; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[2:3] -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_add_f32_e32 v0, s4, v17 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 -; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[0:1] -; VI-NEXT: v_add_f32_e32 v0, s4, v17 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_add_f32_e32 v0, s4, v17 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v56, v3, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[0:1] -; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v38 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_add_f32_e32 v0, s4, v17 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v5, vcc +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v38 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v5, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v58, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; VI-NEXT: v_bfe_u32 v4, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[0:1] -; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v37 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_add_f32_e32 v0, s4, v17 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v7, vcc +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v7, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v1 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b32_e32 v61, 16, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; VI-NEXT: v_bfe_u32 v6, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v60, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v1 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc +; VI-NEXT: v_bfe_u32 v6, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] -; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v36 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_add_f32_e32 v0, s4, v17 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v9, vcc +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v9, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v1 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; VI-NEXT: v_bfe_u32 v8, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v62, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v1 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; VI-NEXT: v_bfe_u32 v8, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] -; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_add_f32_e32 v0, s4, v17 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_add_f32_e32 v1, s4, v17 -; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v17 -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v17 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v18, v5, v7, vcc -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v17 -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v49, v5, v7, vcc -; VI-NEXT: v_add_f32_e32 v5, s4, v17 -; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc -; VI-NEXT: v_add_f32_e32 v7, s4, v17 -; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 -; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; VI-NEXT: v_add_f32_e32 v5, s4, v17 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; VI-NEXT: v_mov_b32_e32 v1, v18 -; VI-NEXT: v_cndmask_b32_e32 v18, v9, v11, vcc -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc -; VI-NEXT: v_add_f32_e32 v7, s4, v17 -; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 -; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50] -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 -; VI-NEXT: v_mov_b32_e32 v3, v49 -; VI-NEXT: v_cndmask_b32_e32 v49, v9, v11, vcc -; VI-NEXT: v_add_f32_e32 v9, s4, v17 -; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc +; VI-NEXT: v_bfe_u32 v11, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v0 ; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: s_lshl_b32 s4, s25, 16 -; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc -; VI-NEXT: v_add_f32_e32 v11, s4, v17 -; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 -; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v11, v13, vcc +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; VI-NEXT: v_bfe_u32 v13, v10, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v10 ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; VI-NEXT: v_add_f32_e32 v9, s4, v17 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v5 -; VI-NEXT: v_mov_b32_e32 v5, v18 -; VI-NEXT: v_cndmask_b32_e32 v18, v13, v15, vcc -; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v13, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 ; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: s_lshl_b32 s4, s27, 16 -; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc -; VI-NEXT: v_add_f32_e32 v11, s4, v17 -; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 -; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50] -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 -; VI-NEXT: v_mov_b32_e32 v7, v49 -; VI-NEXT: v_cndmask_b32_e32 v49, v13, v15, vcc -; VI-NEXT: v_add_f32_e32 v13, s4, v17 -; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 -; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] -; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v9 -; VI-NEXT: v_mov_b32_e32 v9, v18 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_cndmask_b32_e32 v13, v15, v18, vcc -; VI-NEXT: v_add_f32_e32 v15, s4, v17 -; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; VI-NEXT: v_add_f32_e32 v13, s4, v17 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_cndmask_b32_e32 v10, v11, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v13 +; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v34 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 ; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 -; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 ; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: s_lshl_b32 s4, s31, 16 ; VI-NEXT: v_cndmask_b32_e32 v13, v15, v19, vcc -; VI-NEXT: v_add_f32_e32 v15, s4, v17 -; VI-NEXT: v_bfe_u32 v17, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15 -; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50] -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v34 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v19, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v15 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v12, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v32, v19, v32, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v12 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v15, v15, v19, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v15 +; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v19, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v15 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: v_mov_b32_e32 v11, v49 -; VI-NEXT: v_cndmask_b32_e32 v49, v17, v19, vcc -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v16 -; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; VI-NEXT: v_mov_b32_e32 v13, v18 -; VI-NEXT: v_bfe_u32 v18, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v16 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v17 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v19, vcc -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v18 -; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v38 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_cndmask_b32_e32 v15, v19, v34, vcc ; VI-NEXT: v_bfe_u32 v19, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v18 ; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19 -; VI-NEXT: v_or_b32_e32 v21, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_cndmask_b32_e32 v18, v19, v21, vcc -; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v38 -; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; VI-NEXT: v_bfe_u32 v21, v19, 16, 1 -; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v19 -; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 -; VI-NEXT: v_or_b32_e32 v23, 0x400000, v19 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v48 -; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; VI-NEXT: v_cndmask_b32_e32 v38, v21, v23, vcc -; VI-NEXT: v_bfe_u32 v21, v19, 16, 1 -; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v19 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v48 -; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 -; VI-NEXT: v_or_b32_e32 v23, 0x400000, v19 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_cndmask_b32_e32 v19, v21, v23, vcc -; VI-NEXT: v_bfe_u32 v21, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v18 -; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 -; VI-NEXT: v_or_b32_e32 v23, 0x400000, v18 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_cndmask_b32_e32 v18, v21, v23, vcc -; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v37 -; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; VI-NEXT: v_bfe_u32 v23, v21, 16, 1 -; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v21 -; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 -; VI-NEXT: v_or_b32_e32 v25, 0x400000, v21 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: v_cndmask_b32_e32 v21, v23, v25, vcc -; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v37 -; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50] -; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; VI-NEXT: v_bfe_u32 v25, v23, 16, 1 -; VI-NEXT: v_mov_b32_e32 v15, v49 -; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v23 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v21 -; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v20 -; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 -; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; VI-NEXT: v_or_b32_e32 v27, 0x400000, v23 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: v_bfe_u32 v23, v20, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v48, v25, v27, vcc -; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v20 -; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 -; VI-NEXT: v_or_b32_e32 v25, 0x400000, v20 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v21 -; VI-NEXT: v_bfe_u32 v21, v20, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v23, v23, v25, vcc -; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v20 -; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 -; VI-NEXT: v_or_b32_e32 v25, 0x400000, v20 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; VI-NEXT: v_cndmask_b32_e32 v20, v21, v25, vcc -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v23 -; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v36 -; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; VI-NEXT: v_bfe_u32 v25, v23, 16, 1 -; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v23 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 -; VI-NEXT: v_or_b32_e32 v27, 0x400000, v23 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: v_cndmask_b32_e32 v23, v25, v27, vcc -; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v36 -; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; VI-NEXT: v_bfe_u32 v27, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v25 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 -; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v22 -; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 -; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; VI-NEXT: v_or_b32_e32 v29, 0x400000, v25 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: v_bfe_u32 v25, v22, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v36, v27, v29, vcc -; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v22 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 -; VI-NEXT: v_or_b32_e32 v27, 0x400000, v22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v23 -; VI-NEXT: v_bfe_u32 v23, v22, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v25, v25, v27, vcc -; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v22 -; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 -; VI-NEXT: v_or_b32_e32 v27, 0x400000, v22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; VI-NEXT: v_cndmask_b32_e32 v22, v23, v27, vcc -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v25 -; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v35 -; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; VI-NEXT: v_bfe_u32 v27, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v25 -; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 -; VI-NEXT: v_or_b32_e32 v29, 0x400000, v25 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: v_cndmask_b32_e32 v25, v27, v29, vcc -; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v35 -; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; VI-NEXT: v_bfe_u32 v29, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v27 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v25 -; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v24 -; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 -; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; VI-NEXT: v_bfe_u32 v27, v24, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v50, v29, v35, vcc -; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v24 -; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 -; VI-NEXT: v_or_b32_e32 v29, 0x400000, v24 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v25 -; VI-NEXT: v_bfe_u32 v25, v24, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v27, v27, v29, vcc -; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v24 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 -; VI-NEXT: v_or_b32_e32 v29, 0x400000, v24 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; VI-NEXT: v_cndmask_b32_e32 v24, v25, v29, vcc -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v27 -; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v34 -; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; VI-NEXT: v_bfe_u32 v29, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v27 -; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; VI-NEXT: v_cndmask_b32_e32 v27, v29, v35, vcc -; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v34 -; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; VI-NEXT: v_bfe_u32 v34, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v29 +; VI-NEXT: v_cndmask_b32_e32 v18, v19, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v34, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v14 ; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 ; VI-NEXT: v_cndmask_b32_e32 v34, v34, v35, vcc -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v27 -; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v26 -; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; VI-NEXT: v_bfe_u32 v29, v26, 16, 1 -; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v26 -; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 -; VI-NEXT: v_or_b32_e32 v52, 0x400000, v26 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v27 -; VI-NEXT: v_bfe_u32 v27, v26, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v29, v29, v52, vcc -; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v26 -; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 -; VI-NEXT: v_or_b32_e32 v52, 0x400000, v26 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: v_cndmask_b32_e32 v26, v27, v52, vcc -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v29 -; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v33 -; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; VI-NEXT: v_bfe_u32 v52, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v29 -; VI-NEXT: v_add_u32_e32 v52, vcc, 0x7fff, v52 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_or_b32_e32 v53, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; VI-NEXT: v_cndmask_b32_e32 v29, v52, v53, vcc -; VI-NEXT: v_bfe_u32 v52, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v33 -; VI-NEXT: v_add_u32_e32 v52, vcc, 0x7fff, v52 -; VI-NEXT: v_or_b32_e32 v53, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; VI-NEXT: v_cndmask_b32_e32 v52, v52, v53, vcc -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v29 -; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v28 -; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; VI-NEXT: v_bfe_u32 v33, v28, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v28 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v54, 0x400000, v28 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v29 -; VI-NEXT: v_bfe_u32 v29, v28, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v33, v33, v54, vcc -; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v28 -; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 -; VI-NEXT: v_or_b32_e32 v54, 0x400000, v28 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; VI-NEXT: v_cndmask_b32_e32 v28, v29, v54, vcc -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v33 -; VI-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 -; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; VI-NEXT: v_bfe_u32 v54, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v33 -; VI-NEXT: v_add_u32_e32 v54, vcc, 0x7fff, v54 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_or_b32_e32 v55, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_cndmask_b32_e32 v33, v54, v55, vcc -; VI-NEXT: v_bfe_u32 v54, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v32 -; VI-NEXT: v_add_u32_e32 v54, vcc, 0x7fff, v54 -; VI-NEXT: v_or_b32_e32 v55, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; VI-NEXT: v_cndmask_b32_e32 v32, v54, v55, vcc -; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; VI-NEXT: v_bfe_u32 v55, v30, 16, 1 -; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v30 -; VI-NEXT: v_add_u32_e32 v55, vcc, 0x7fff, v55 -; VI-NEXT: v_or_b32_e32 v40, 0x400000, v30 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; VI-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 -; VI-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc -; VI-NEXT: v_bfe_u32 v55, v54, 16, 1 -; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v54 -; VI-NEXT: v_add_u32_e32 v55, vcc, 0x7fff, v55 -; VI-NEXT: v_or_b32_e32 v40, 0x400000, v54 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v54, v54 -; VI-NEXT: v_cndmask_b32_e32 v54, v55, v40, vcc -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v31 -; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; VI-NEXT: v_bfe_u32 v40, v30, 16, 1 -; VI-NEXT: v_add_u32_e32 v40, vcc, v40, v30 -; VI-NEXT: v_add_u32_e32 v40, vcc, 0x7fff, v40 -; VI-NEXT: v_or_b32_e32 v41, 0x400000, v30 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v31 -; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; VI-NEXT: v_bfe_u32 v31, v30, 16, 1 -; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[48:49] -; VI-NEXT: v_cndmask_b32_e32 v40, v40, v41, vcc -; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v30 -; VI-NEXT: v_lshrrev_b64 v[36:37], 16, v[36:37] -; VI-NEXT: v_add_u32_e32 v31, vcc, 0x7fff, v31 -; VI-NEXT: v_mov_b32_e32 v37, v48 -; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[50:51] -; VI-NEXT: v_or_b32_e32 v41, 0x400000, v30 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; VI-NEXT: v_lshrrev_b64 v[34:35], 16, v[34:35] -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_cndmask_b32_e32 v30, v31, v41, vcc -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v40 -; VI-NEXT: v_mov_b32_e32 v35, v48 -; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[52:53] -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[32:33] -; VI-NEXT: v_lshrrev_b64 v[50:51], 16, v[30:31] -; VI-NEXT: v_lshrrev_b64 v[38:39], 16, v[38:39] -; VI-NEXT: v_mov_b32_e32 v33, v48 -; VI-NEXT: v_lshrrev_b64 v[30:31], 16, v[54:55] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_cndmask_b32_e32 v14, v15, v35, vcc +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v34 +; VI-NEXT: v_and_b32_e32 v34, 0xffff0000, v55 +; VI-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; VI-NEXT: v_bfe_u32 v35, v34, 16, 1 +; VI-NEXT: v_add_u32_e32 v35, vcc, v35, v34 +; VI-NEXT: v_add_u32_e32 v35, vcc, 0x7fff, v35 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 +; VI-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc +; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v55 +; VI-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; VI-NEXT: v_bfe_u32 v36, v35, 16, 1 +; VI-NEXT: v_add_u32_e32 v36, vcc, v36, v35 +; VI-NEXT: v_add_u32_e32 v36, vcc, 0x7fff, v36 +; VI-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v35, v35 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v34 +; VI-NEXT: v_lshrrev_b64 v[34:35], 16, v[56:57] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_mov_b32_e32 v39, v34 +; VI-NEXT: v_lshrrev_b64 v[34:35], 16, v[58:59] +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v38, v34 +; VI-NEXT: v_lshrrev_b64 v[34:35], 16, v[60:61] +; VI-NEXT: v_cndmask_b32_e32 v54, v36, v37, vcc +; VI-NEXT: v_mov_b32_e32 v37, v34 +; VI-NEXT: v_lshrrev_b64 v[34:35], 16, v[62:63] +; VI-NEXT: v_mov_b32_e32 v35, v0 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[32:33] +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v36, v34 +; VI-NEXT: v_mov_b32_e32 v34, v0 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[54:55] +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[14:15] +; VI-NEXT: v_mov_b32_e32 v55, v0 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[40:41] +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[12:13] +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[10:11] +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[8:9] +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[6:7] +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[4:5] +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] ; VI-NEXT: v_lshrrev_b64 v[28:29], 16, v[28:29] ; VI-NEXT: v_lshrrev_b64 v[26:27], 16, v[26:27] ; VI-NEXT: v_lshrrev_b64 v[24:25], 16, v[24:25] ; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[22:23] ; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[20:21] -; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[18:19] -; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[16:17] -; VI-NEXT: v_mov_b32_e32 v31, v50 -; VI-NEXT: s_branch .LBB101_5 -; VI-NEXT: .LBB101_3: -; VI-NEXT: s_branch .LBB101_2 -; VI-NEXT: .LBB101_4: -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 -; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 -; VI-NEXT: v_mov_b32_e32 v13, s29 -; VI-NEXT: v_mov_b32_e32 v14, s30 -; VI-NEXT: v_mov_b32_e32 v15, s31 -; VI-NEXT: .LBB101_5: ; %end -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v17, v38 -; VI-NEXT: v_mov_b32_e32 v18, v48 -; VI-NEXT: v_mov_b32_e32 v19, v37 -; VI-NEXT: v_mov_b32_e32 v21, v36 -; VI-NEXT: v_mov_b32_e32 v23, v35 -; VI-NEXT: v_mov_b32_e32 v25, v34 -; VI-NEXT: v_mov_b32_e32 v27, v33 -; VI-NEXT: v_mov_b32_e32 v29, v32 -; VI-NEXT: v_readlane_b32 s31, v42, 1 -; VI-NEXT: v_readlane_b32 s30, v42, 0 -; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[32:33] +; VI-NEXT: v_mov_b32_e32 v54, v32 +; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[52:53] +; VI-NEXT: v_mov_b32_e32 v53, v0 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[42:43] +; VI-NEXT: v_mov_b32_e32 v52, v32 +; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[50:51] +; VI-NEXT: v_mov_b32_e32 v51, v0 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[44:45] +; VI-NEXT: v_mov_b32_e32 v50, v32 +; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[48:49] +; VI-NEXT: v_mov_b32_e32 v49, v0 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[16:17] +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v48, v32 +; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[30:31] +; VI-NEXT: v_lshrrev_b64 v[30:31], 16, v[46:47] +; VI-NEXT: v_mov_b32_e32 v31, v32 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b64 v[40:41], 16, v[15:16] +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[15:16] +; VI-NEXT: .LBB101_3: ; %end +; VI-NEXT: v_mov_b32_e32 v13, v18 +; VI-NEXT: v_mov_b32_e32 v18, v40 +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v1, v39 +; VI-NEXT: v_mov_b32_e32 v3, v38 +; VI-NEXT: v_mov_b32_e32 v5, v37 +; VI-NEXT: v_mov_b32_e32 v7, v36 +; VI-NEXT: v_mov_b32_e32 v9, v35 +; VI-NEXT: v_mov_b32_e32 v11, v34 +; VI-NEXT: v_mov_b32_e32 v15, v55 +; VI-NEXT: v_mov_b32_e32 v17, v54 +; VI-NEXT: v_mov_b32_e32 v19, v53 +; VI-NEXT: v_mov_b32_e32 v21, v52 +; VI-NEXT: v_mov_b32_e32 v23, v51 +; VI-NEXT: v_mov_b32_e32 v25, v50 +; VI-NEXT: v_mov_b32_e32 v27, v49 +; VI-NEXT: v_mov_b32_e32 v29, v48 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB101_4: +; VI-NEXT: s_branch .LBB101_2 ; ; GFX9-LABEL: bitcast_v64bf16_to_v64f16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v43, s30, 0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GFX9-NEXT: v_writelane_b32 v43, s31, 1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v31, v17 ; GFX9-NEXT: v_mov_b32_e32 v30, v16 ; GFX9-NEXT: v_mov_b32_e32 v29, v15 @@ -226307,658 +224991,669 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; GFX9-NEXT: v_mov_b32_e32 v32, v4 ; GFX9-NEXT: v_mov_b32_e32 v17, v3 ; GFX9-NEXT: v_mov_b32_e32 v16, v2 -; GFX9-NEXT: v_readfirstlane_b32 s30, v0 +; GFX9-NEXT: v_mov_b32_e32 v63, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_readfirstlane_b32 s31, v1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_cbranch_scc0 .LBB101_3 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB101_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB101_4 +; GFX9-NEXT: s_cbranch_execnz .LBB101_3 ; GFX9-NEXT: .LBB101_2: ; %cmp.true -; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; GFX9-NEXT: s_and_b32 s4, s30, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s30, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b32 s4, s31, 0xffff0000 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: s_lshl_b32 s4, s31, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: v_mov_b32_e32 v18, 0xffff -; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 -; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_and_b32_sdwa v4, v18, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_lshl_or_b32 v15, v3, 16, v4 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s29, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s28, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s28, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s27, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s26, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s26, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s25, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s24, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s24, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s23, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s22, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s21, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s20, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s19, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v33, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s18, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v33, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v33, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v33, v33, v2 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v16 +; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v33, v33, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; GFX9-NEXT: v_add_u32_e32 v33, 0x7fff, v33 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v33, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v33, v33, v1 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v16, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v33, v33, v16 ; GFX9-NEXT: v_add_u32_e32 v33, 0x7fff, v33 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s17, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc -; GFX9-NEXT: v_add_f32_e32 v33, s4, v0 -; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 -; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 -; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v33, v18, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v33 -; GFX9-NEXT: v_add_f32_e32 v33, s4, v0 -; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 -; GFX9-NEXT: s_lshl_b32 s4, s16, 16 -; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 -; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; GFX9-NEXT: v_bfe_u32 v34, v0, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v34, v34, v0 -; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 -; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v34, v35, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX9-NEXT: v_and_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v33, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v16 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v17 ; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 ; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; GFX9-NEXT: v_bfe_u32 v34, v16, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v34, v34, v16 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_cndmask_b32_e32 v15, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v17 ; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 -; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v16 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v34, v35, vcc -; GFX9-NEXT: v_and_b32_e32 v34, 0xffff0000, v17 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_and_b32_e32 v34, 0xffff0000, v32 ; GFX9-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 ; GFX9-NEXT: v_bfe_u32 v35, v34, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v35, v35, v34 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v32 ; GFX9-NEXT: v_add_u32_e32 v35, 0x7fff, v35 ; GFX9-NEXT: v_or_b32_e32 v36, 0x400000, v34 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 -; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; GFX9-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc -; GFX9-NEXT: v_bfe_u32 v35, v17, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v35, v35, v17 +; GFX9-NEXT: v_bfe_u32 v35, v32, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v35, v35, v32 ; GFX9-NEXT: v_add_u32_e32 v35, 0x7fff, v35 -; GFX9-NEXT: v_or_b32_e32 v36, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v35, v36, vcc -; GFX9-NEXT: v_and_b32_e32 v35, 0xffff0000, v32 +; GFX9-NEXT: v_or_b32_e32 v36, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v35, v36, vcc +; GFX9-NEXT: v_and_b32_e32 v35, 0xffff0000, v19 ; GFX9-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 ; GFX9-NEXT: v_bfe_u32 v36, v35, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v36, v36, v35 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; GFX9-NEXT: v_add_u32_e32 v36, 0x7fff, v36 ; GFX9-NEXT: v_or_b32_e32 v37, 0x400000, v35 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v35, v35 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; GFX9-NEXT: v_cndmask_b32_e32 v35, v36, v37, vcc -; GFX9-NEXT: v_bfe_u32 v36, v32, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v36, v36, v32 +; GFX9-NEXT: v_bfe_u32 v36, v19, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v36, v36, v19 ; GFX9-NEXT: v_add_u32_e32 v36, 0x7fff, v36 -; GFX9-NEXT: v_or_b32_e32 v37, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v36, v37, vcc -; GFX9-NEXT: v_and_b32_e32 v36, 0xffff0000, v19 +; GFX9-NEXT: v_or_b32_e32 v37, 0x400000, v19 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v36, v37, vcc +; GFX9-NEXT: v_and_b32_e32 v36, 0xffff0000, v20 ; GFX9-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 ; GFX9-NEXT: v_bfe_u32 v37, v36, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v37, v37, v36 -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; GFX9-NEXT: v_add_u32_e32 v37, 0x7fff, v37 ; GFX9-NEXT: v_or_b32_e32 v38, 0x400000, v36 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v36, v36 -; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; GFX9-NEXT: v_cndmask_b32_e32 v36, v37, v38, vcc -; GFX9-NEXT: v_bfe_u32 v37, v19, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v37, v37, v19 +; GFX9-NEXT: v_bfe_u32 v37, v20, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v37, v37, v20 ; GFX9-NEXT: v_add_u32_e32 v37, 0x7fff, v37 -; GFX9-NEXT: v_or_b32_e32 v38, 0x400000, v19 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v37, v38, vcc -; GFX9-NEXT: v_and_b32_e32 v37, 0xffff0000, v20 +; GFX9-NEXT: v_or_b32_e32 v38, 0x400000, v20 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v37, v38, vcc +; GFX9-NEXT: v_and_b32_e32 v37, 0xffff0000, v21 ; GFX9-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 ; GFX9-NEXT: v_bfe_u32 v38, v37, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v38, v38, v37 -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; GFX9-NEXT: v_add_u32_e32 v38, 0x7fff, v38 ; GFX9-NEXT: v_or_b32_e32 v39, 0x400000, v37 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v37, v37 -; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 ; GFX9-NEXT: v_cndmask_b32_e32 v37, v38, v39, vcc -; GFX9-NEXT: v_bfe_u32 v38, v20, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v38, v38, v20 +; GFX9-NEXT: v_bfe_u32 v38, v21, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v38, v38, v21 ; GFX9-NEXT: v_add_u32_e32 v38, 0x7fff, v38 -; GFX9-NEXT: v_or_b32_e32 v39, 0x400000, v20 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; GFX9-NEXT: v_cndmask_b32_e32 v20, v38, v39, vcc -; GFX9-NEXT: v_and_b32_e32 v38, 0xffff0000, v21 +; GFX9-NEXT: v_or_b32_e32 v39, 0x400000, v21 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v38, v39, vcc +; GFX9-NEXT: v_and_b32_e32 v38, 0xffff0000, v22 ; GFX9-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 ; GFX9-NEXT: v_bfe_u32 v39, v38, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v39, v39, v38 -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; GFX9-NEXT: v_add_u32_e32 v39, 0x7fff, v39 ; GFX9-NEXT: v_or_b32_e32 v48, 0x400000, v38 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v38, v38 -; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 ; GFX9-NEXT: v_cndmask_b32_e32 v38, v39, v48, vcc -; GFX9-NEXT: v_bfe_u32 v39, v21, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v39, v39, v21 +; GFX9-NEXT: v_bfe_u32 v39, v22, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v39, v39, v22 ; GFX9-NEXT: v_add_u32_e32 v39, 0x7fff, v39 -; GFX9-NEXT: v_or_b32_e32 v48, 0x400000, v21 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; GFX9-NEXT: v_cndmask_b32_e32 v21, v39, v48, vcc -; GFX9-NEXT: v_and_b32_e32 v39, 0xffff0000, v22 +; GFX9-NEXT: v_or_b32_e32 v48, 0x400000, v22 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v39, v48, vcc +; GFX9-NEXT: v_and_b32_e32 v39, 0xffff0000, v23 ; GFX9-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 ; GFX9-NEXT: v_bfe_u32 v48, v39, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v48, v48, v39 -; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; GFX9-NEXT: v_add_u32_e32 v48, 0x7fff, v48 ; GFX9-NEXT: v_or_b32_e32 v49, 0x400000, v39 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v39, v39 -; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 ; GFX9-NEXT: v_cndmask_b32_e32 v39, v48, v49, vcc -; GFX9-NEXT: v_bfe_u32 v48, v22, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v48, v48, v22 +; GFX9-NEXT: v_bfe_u32 v48, v23, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v48, v48, v23 ; GFX9-NEXT: v_add_u32_e32 v48, 0x7fff, v48 -; GFX9-NEXT: v_or_b32_e32 v49, 0x400000, v22 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; GFX9-NEXT: v_cndmask_b32_e32 v22, v48, v49, vcc -; GFX9-NEXT: v_and_b32_e32 v48, 0xffff0000, v23 +; GFX9-NEXT: v_or_b32_e32 v49, 0x400000, v23 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v48, v49, vcc +; GFX9-NEXT: v_and_b32_e32 v48, 0xffff0000, v24 ; GFX9-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 ; GFX9-NEXT: v_bfe_u32 v49, v48, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v49, v49, v48 -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; GFX9-NEXT: v_add_u32_e32 v49, 0x7fff, v49 ; GFX9-NEXT: v_or_b32_e32 v50, 0x400000, v48 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v48, v48 -; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 ; GFX9-NEXT: v_cndmask_b32_e32 v48, v49, v50, vcc -; GFX9-NEXT: v_bfe_u32 v49, v23, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v49, v49, v23 +; GFX9-NEXT: v_bfe_u32 v49, v24, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v49, v49, v24 ; GFX9-NEXT: v_add_u32_e32 v49, 0x7fff, v49 -; GFX9-NEXT: v_or_b32_e32 v50, 0x400000, v23 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v23, v49, v50, vcc -; GFX9-NEXT: v_and_b32_e32 v49, 0xffff0000, v24 +; GFX9-NEXT: v_or_b32_e32 v50, 0x400000, v24 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v49, v50, vcc +; GFX9-NEXT: v_and_b32_e32 v49, 0xffff0000, v25 ; GFX9-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 ; GFX9-NEXT: v_bfe_u32 v50, v49, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v50, v50, v49 -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; GFX9-NEXT: v_add_u32_e32 v50, 0x7fff, v50 ; GFX9-NEXT: v_or_b32_e32 v51, 0x400000, v49 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v49, v49 -; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 ; GFX9-NEXT: v_cndmask_b32_e32 v49, v50, v51, vcc -; GFX9-NEXT: v_bfe_u32 v50, v24, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v50, v50, v24 +; GFX9-NEXT: v_bfe_u32 v50, v25, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v50, v50, v25 ; GFX9-NEXT: v_add_u32_e32 v50, 0x7fff, v50 -; GFX9-NEXT: v_or_b32_e32 v51, 0x400000, v24 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v24, v50, v51, vcc -; GFX9-NEXT: v_and_b32_e32 v50, 0xffff0000, v25 +; GFX9-NEXT: v_or_b32_e32 v51, 0x400000, v25 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v50, v51, vcc +; GFX9-NEXT: v_and_b32_e32 v50, 0xffff0000, v26 ; GFX9-NEXT: v_add_f32_e32 v50, 0x40c00000, v50 ; GFX9-NEXT: v_bfe_u32 v51, v50, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v51, v51, v50 -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; GFX9-NEXT: v_add_u32_e32 v51, 0x7fff, v51 ; GFX9-NEXT: v_or_b32_e32 v52, 0x400000, v50 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v50, v50 -; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 ; GFX9-NEXT: v_cndmask_b32_e32 v50, v51, v52, vcc -; GFX9-NEXT: v_bfe_u32 v51, v25, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v51, v51, v25 +; GFX9-NEXT: v_bfe_u32 v51, v26, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v51, v51, v26 ; GFX9-NEXT: v_add_u32_e32 v51, 0x7fff, v51 -; GFX9-NEXT: v_or_b32_e32 v52, 0x400000, v25 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; GFX9-NEXT: v_cndmask_b32_e32 v25, v51, v52, vcc -; GFX9-NEXT: v_and_b32_e32 v51, 0xffff0000, v26 +; GFX9-NEXT: v_or_b32_e32 v52, 0x400000, v26 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v51, v52, vcc +; GFX9-NEXT: v_and_b32_e32 v51, 0xffff0000, v27 ; GFX9-NEXT: v_add_f32_e32 v51, 0x40c00000, v51 ; GFX9-NEXT: v_bfe_u32 v52, v51, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v52, v52, v51 -; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; GFX9-NEXT: v_add_u32_e32 v52, 0x7fff, v52 ; GFX9-NEXT: v_or_b32_e32 v53, 0x400000, v51 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v51, v51 -; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 ; GFX9-NEXT: v_cndmask_b32_e32 v51, v52, v53, vcc -; GFX9-NEXT: v_bfe_u32 v52, v26, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v52, v52, v26 +; GFX9-NEXT: v_bfe_u32 v52, v27, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v52, v52, v27 ; GFX9-NEXT: v_add_u32_e32 v52, 0x7fff, v52 -; GFX9-NEXT: v_or_b32_e32 v53, 0x400000, v26 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; GFX9-NEXT: v_cndmask_b32_e32 v26, v52, v53, vcc -; GFX9-NEXT: v_and_b32_e32 v52, 0xffff0000, v27 +; GFX9-NEXT: v_or_b32_e32 v53, 0x400000, v27 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v52, v53, vcc +; GFX9-NEXT: v_and_b32_e32 v52, 0xffff0000, v28 ; GFX9-NEXT: v_add_f32_e32 v52, 0x40c00000, v52 ; GFX9-NEXT: v_bfe_u32 v53, v52, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v53, v53, v52 -; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; GFX9-NEXT: v_add_u32_e32 v53, 0x7fff, v53 ; GFX9-NEXT: v_or_b32_e32 v54, 0x400000, v52 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v52, v52 -; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; GFX9-NEXT: v_cndmask_b32_e32 v52, v53, v54, vcc -; GFX9-NEXT: v_bfe_u32 v53, v27, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v53, v53, v27 +; GFX9-NEXT: v_bfe_u32 v53, v28, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v53, v53, v28 ; GFX9-NEXT: v_add_u32_e32 v53, 0x7fff, v53 -; GFX9-NEXT: v_or_b32_e32 v54, 0x400000, v27 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; GFX9-NEXT: v_cndmask_b32_e32 v27, v53, v54, vcc -; GFX9-NEXT: v_and_b32_e32 v53, 0xffff0000, v28 +; GFX9-NEXT: v_or_b32_e32 v54, 0x400000, v28 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc +; GFX9-NEXT: v_and_b32_e32 v53, 0xffff0000, v29 ; GFX9-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 ; GFX9-NEXT: v_bfe_u32 v54, v53, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v54, v54, v53 -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; GFX9-NEXT: v_add_u32_e32 v54, 0x7fff, v54 ; GFX9-NEXT: v_or_b32_e32 v55, 0x400000, v53 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v53, v53 -; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 ; GFX9-NEXT: v_cndmask_b32_e32 v53, v54, v55, vcc -; GFX9-NEXT: v_bfe_u32 v54, v28, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v54, v54, v28 +; GFX9-NEXT: v_bfe_u32 v54, v29, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v54, v54, v29 ; GFX9-NEXT: v_add_u32_e32 v54, 0x7fff, v54 -; GFX9-NEXT: v_or_b32_e32 v55, 0x400000, v28 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; GFX9-NEXT: v_cndmask_b32_e32 v28, v54, v55, vcc -; GFX9-NEXT: v_and_b32_e32 v54, 0xffff0000, v29 +; GFX9-NEXT: v_or_b32_e32 v55, 0x400000, v29 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v54, v55, vcc +; GFX9-NEXT: v_and_b32_e32 v54, 0xffff0000, v30 ; GFX9-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 ; GFX9-NEXT: v_bfe_u32 v55, v54, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v55, v55, v54 -; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; GFX9-NEXT: v_add_u32_e32 v55, 0x7fff, v55 ; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v54 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v54, v54 -; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; GFX9-NEXT: v_cndmask_b32_e32 v54, v55, v40, vcc -; GFX9-NEXT: v_bfe_u32 v55, v29, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v55, v55, v29 +; GFX9-NEXT: v_bfe_u32 v55, v30, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v55, v55, v30 ; GFX9-NEXT: v_add_u32_e32 v55, 0x7fff, v55 -; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v29 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; GFX9-NEXT: v_cndmask_b32_e32 v29, v55, v40, vcc -; GFX9-NEXT: v_and_b32_e32 v55, 0xffff0000, v30 +; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v30 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc +; GFX9-NEXT: v_and_b32_e32 v55, 0xffff0000, v31 ; GFX9-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 ; GFX9-NEXT: v_bfe_u32 v40, v55, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v40, v40, v55 -; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; GFX9-NEXT: v_add_u32_e32 v40, 0x7fff, v40 ; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v55 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v55, v55 -; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 ; GFX9-NEXT: v_cndmask_b32_e32 v55, v40, v41, vcc -; GFX9-NEXT: v_bfe_u32 v40, v30, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v40, v40, v30 +; GFX9-NEXT: v_bfe_u32 v40, v31, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v40, v40, v31 ; GFX9-NEXT: v_add_u32_e32 v40, 0x7fff, v40 -; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v30 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; GFX9-NEXT: v_cndmask_b32_e32 v30, v40, v41, vcc -; GFX9-NEXT: v_and_b32_e32 v40, 0xffff0000, v31 +; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v31 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v40, v41, vcc +; GFX9-NEXT: v_and_b32_e32 v40, 0xffff0000, v0 ; GFX9-NEXT: v_add_f32_e32 v40, 0x40c00000, v40 ; GFX9-NEXT: v_bfe_u32 v41, v40, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v41, v41, v40 -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_add_u32_e32 v41, 0x7fff, v41 ; GFX9-NEXT: v_or_b32_e32 v42, 0x400000, v40 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v40, v40 -; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v40, v41, v42, vcc -; GFX9-NEXT: v_bfe_u32 v41, v31, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v41, v41, v31 +; GFX9-NEXT: v_bfe_u32 v41, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v41, v41, v0 ; GFX9-NEXT: v_add_u32_e32 v41, 0x7fff, v41 -; GFX9-NEXT: v_or_b32_e32 v42, 0x400000, v31 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; GFX9-NEXT: v_cndmask_b32_e32 v31, v41, v42, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v40 -; GFX9-NEXT: v_and_b32_sdwa v31, v18, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v55 -; GFX9-NEXT: v_and_b32_sdwa v30, v18, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v54 -; GFX9-NEXT: v_and_b32_sdwa v29, v18, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v53 -; GFX9-NEXT: v_and_b32_sdwa v28, v18, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v52 -; GFX9-NEXT: v_and_b32_sdwa v27, v18, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v51 -; GFX9-NEXT: v_and_b32_sdwa v26, v18, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v50 -; GFX9-NEXT: v_and_b32_sdwa v25, v18, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v49 -; GFX9-NEXT: v_and_b32_sdwa v24, v18, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v48 -; GFX9-NEXT: v_and_b32_sdwa v23, v18, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v39 -; GFX9-NEXT: v_and_b32_sdwa v22, v18, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; GFX9-NEXT: v_and_b32_sdwa v21, v18, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v37 -; GFX9-NEXT: v_and_b32_sdwa v20, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GFX9-NEXT: v_and_b32_sdwa v19, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX9-NEXT: v_and_b32_sdwa v32, v18, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX9-NEXT: v_and_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_and_b32_sdwa v16, v18, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v33 -; GFX9-NEXT: v_lshl_or_b32 v31, v40, 16, v31 -; GFX9-NEXT: v_lshl_or_b32 v30, v55, 16, v30 -; GFX9-NEXT: v_lshl_or_b32 v29, v54, 16, v29 -; GFX9-NEXT: v_lshl_or_b32 v28, v53, 16, v28 -; GFX9-NEXT: v_lshl_or_b32 v27, v52, 16, v27 -; GFX9-NEXT: v_lshl_or_b32 v26, v51, 16, v26 -; GFX9-NEXT: v_lshl_or_b32 v25, v50, 16, v25 -; GFX9-NEXT: v_lshl_or_b32 v24, v49, 16, v24 -; GFX9-NEXT: v_lshl_or_b32 v23, v48, 16, v23 -; GFX9-NEXT: v_lshl_or_b32 v22, v39, 16, v22 -; GFX9-NEXT: v_lshl_or_b32 v21, v38, 16, v21 -; GFX9-NEXT: v_lshl_or_b32 v20, v37, 16, v20 -; GFX9-NEXT: v_lshl_or_b32 v19, v36, 16, v19 -; GFX9-NEXT: v_lshl_or_b32 v32, v35, 16, v32 -; GFX9-NEXT: v_lshl_or_b32 v17, v34, 16, v17 -; GFX9-NEXT: v_lshl_or_b32 v16, v18, 16, v16 -; GFX9-NEXT: s_branch .LBB101_5 -; GFX9-NEXT: .LBB101_3: -; GFX9-NEXT: s_branch .LBB101_2 -; GFX9-NEXT: .LBB101_4: -; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 -; GFX9-NEXT: v_mov_b32_e32 v4, s20 -; GFX9-NEXT: v_mov_b32_e32 v5, s21 -; GFX9-NEXT: v_mov_b32_e32 v6, s22 -; GFX9-NEXT: v_mov_b32_e32 v7, s23 -; GFX9-NEXT: v_mov_b32_e32 v8, s24 -; GFX9-NEXT: v_mov_b32_e32 v9, s25 -; GFX9-NEXT: v_mov_b32_e32 v10, s26 -; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: v_mov_b32_e32 v12, s28 -; GFX9-NEXT: v_mov_b32_e32 v13, s29 -; GFX9-NEXT: v_mov_b32_e32 v14, s30 -; GFX9-NEXT: v_mov_b32_e32 v15, s31 -; GFX9-NEXT: .LBB101_5: ; %end -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_e32 v42, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v41, v42, vcc +; GFX9-NEXT: v_and_b32_e32 v41, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v41, 0x40c00000, v41 +; GFX9-NEXT: v_bfe_u32 v42, v41, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v42, v42, v41 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_add_u32_e32 v42, 0x7fff, v42 +; GFX9-NEXT: v_or_b32_e32 v43, 0x400000, v41 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v41, v41 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v41, v42, v43, vcc +; GFX9-NEXT: v_bfe_u32 v42, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v42, v42, v1 +; GFX9-NEXT: v_add_u32_e32 v42, 0x7fff, v42 +; GFX9-NEXT: v_or_b32_e32 v43, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v42, v43, vcc +; GFX9-NEXT: v_and_b32_e32 v42, 0xffff0000, v2 +; GFX9-NEXT: v_add_f32_e32 v42, 0x40c00000, v42 +; GFX9-NEXT: v_bfe_u32 v43, v42, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v43, v43, v42 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_add_u32_e32 v43, 0x7fff, v43 +; GFX9-NEXT: v_or_b32_e32 v44, 0x400000, v42 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v42, v42 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v42, v43, v44, vcc +; GFX9-NEXT: v_bfe_u32 v43, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v43, v43, v2 +; GFX9-NEXT: v_add_u32_e32 v43, 0x7fff, v43 +; GFX9-NEXT: v_or_b32_e32 v44, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v43, v44, vcc +; GFX9-NEXT: v_and_b32_e32 v43, 0xffff0000, v3 +; GFX9-NEXT: v_add_f32_e32 v43, 0x40c00000, v43 +; GFX9-NEXT: v_bfe_u32 v44, v43, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v44, v44, v43 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_add_u32_e32 v44, 0x7fff, v44 +; GFX9-NEXT: v_or_b32_e32 v45, 0x400000, v43 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v43, v43 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v43, v44, v45, vcc +; GFX9-NEXT: v_bfe_u32 v44, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v44, v44, v3 +; GFX9-NEXT: v_add_u32_e32 v44, 0x7fff, v44 +; GFX9-NEXT: v_or_b32_e32 v45, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v44, v45, vcc +; GFX9-NEXT: v_and_b32_e32 v44, 0xffff0000, v4 +; GFX9-NEXT: v_add_f32_e32 v44, 0x40c00000, v44 +; GFX9-NEXT: v_bfe_u32 v45, v44, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v45, v45, v44 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_add_u32_e32 v45, 0x7fff, v45 +; GFX9-NEXT: v_or_b32_e32 v46, 0x400000, v44 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v44, v44 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v44, v45, v46, vcc +; GFX9-NEXT: v_bfe_u32 v45, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v45, v45, v4 +; GFX9-NEXT: v_add_u32_e32 v45, 0x7fff, v45 +; GFX9-NEXT: v_or_b32_e32 v46, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v45, v46, vcc +; GFX9-NEXT: v_and_b32_e32 v45, 0xffff0000, v5 +; GFX9-NEXT: v_add_f32_e32 v45, 0x40c00000, v45 +; GFX9-NEXT: v_bfe_u32 v46, v45, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v46, v46, v45 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_add_u32_e32 v46, 0x7fff, v46 +; GFX9-NEXT: v_or_b32_e32 v47, 0x400000, v45 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v45, v45 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v45, v46, v47, vcc +; GFX9-NEXT: v_bfe_u32 v46, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v46, v46, v5 +; GFX9-NEXT: v_add_u32_e32 v46, 0x7fff, v46 +; GFX9-NEXT: v_or_b32_e32 v47, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v46, v47, vcc +; GFX9-NEXT: v_and_b32_e32 v46, 0xffff0000, v6 +; GFX9-NEXT: v_add_f32_e32 v46, 0x40c00000, v46 +; GFX9-NEXT: v_bfe_u32 v47, v46, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v47, v47, v46 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_add_u32_e32 v47, 0x7fff, v47 +; GFX9-NEXT: v_or_b32_e32 v56, 0x400000, v46 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v46, v46 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v46, v47, v56, vcc +; GFX9-NEXT: v_bfe_u32 v47, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v47, v47, v6 +; GFX9-NEXT: v_add_u32_e32 v47, 0x7fff, v47 +; GFX9-NEXT: v_or_b32_e32 v56, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v47, v56, vcc +; GFX9-NEXT: v_and_b32_e32 v47, 0xffff0000, v7 +; GFX9-NEXT: v_add_f32_e32 v47, 0x40c00000, v47 +; GFX9-NEXT: v_bfe_u32 v56, v47, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v56, v56, v47 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_add_u32_e32 v56, 0x7fff, v56 +; GFX9-NEXT: v_or_b32_e32 v57, 0x400000, v47 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v47, v47 +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v47, v56, v57, vcc +; GFX9-NEXT: v_bfe_u32 v56, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v56, v56, v7 +; GFX9-NEXT: v_add_u32_e32 v56, 0x7fff, v56 +; GFX9-NEXT: v_or_b32_e32 v57, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v56, v57, vcc +; GFX9-NEXT: v_and_b32_e32 v56, 0xffff0000, v8 +; GFX9-NEXT: v_add_f32_e32 v56, 0x40c00000, v56 +; GFX9-NEXT: v_bfe_u32 v57, v56, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v57, v57, v56 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX9-NEXT: v_add_u32_e32 v57, 0x7fff, v57 +; GFX9-NEXT: v_or_b32_e32 v58, 0x400000, v56 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v56, v56 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v56, v57, v58, vcc +; GFX9-NEXT: v_bfe_u32 v57, v8, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v57, v57, v8 +; GFX9-NEXT: v_add_u32_e32 v57, 0x7fff, v57 +; GFX9-NEXT: v_or_b32_e32 v58, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v57, v58, vcc +; GFX9-NEXT: v_and_b32_e32 v57, 0xffff0000, v9 +; GFX9-NEXT: v_add_f32_e32 v57, 0x40c00000, v57 +; GFX9-NEXT: v_bfe_u32 v58, v57, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v58, v58, v57 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX9-NEXT: v_add_u32_e32 v58, 0x7fff, v58 +; GFX9-NEXT: v_or_b32_e32 v59, 0x400000, v57 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v57, v57 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v57, v58, v59, vcc +; GFX9-NEXT: v_bfe_u32 v58, v9, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v58, v58, v9 +; GFX9-NEXT: v_add_u32_e32 v58, 0x7fff, v58 +; GFX9-NEXT: v_or_b32_e32 v59, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v58, v59, vcc +; GFX9-NEXT: v_and_b32_e32 v58, 0xffff0000, v10 +; GFX9-NEXT: v_add_f32_e32 v58, 0x40c00000, v58 +; GFX9-NEXT: v_bfe_u32 v59, v58, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v59, v59, v58 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX9-NEXT: v_add_u32_e32 v59, 0x7fff, v59 +; GFX9-NEXT: v_or_b32_e32 v60, 0x400000, v58 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v58, v58 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v58, v59, v60, vcc +; GFX9-NEXT: v_bfe_u32 v59, v10, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v59, v59, v10 +; GFX9-NEXT: v_add_u32_e32 v59, 0x7fff, v59 +; GFX9-NEXT: v_or_b32_e32 v60, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v59, v60, vcc +; GFX9-NEXT: v_and_b32_e32 v59, 0xffff0000, v11 +; GFX9-NEXT: v_add_f32_e32 v59, 0x40c00000, v59 +; GFX9-NEXT: v_bfe_u32 v60, v59, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v60, v60, v59 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX9-NEXT: v_add_u32_e32 v60, 0x7fff, v60 +; GFX9-NEXT: v_or_b32_e32 v61, 0x400000, v59 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v59, v59 +; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v59, v60, v61, vcc +; GFX9-NEXT: v_bfe_u32 v60, v11, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v60, v60, v11 +; GFX9-NEXT: v_add_u32_e32 v60, 0x7fff, v60 +; GFX9-NEXT: v_or_b32_e32 v61, 0x400000, v11 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v60, v61, vcc +; GFX9-NEXT: v_and_b32_e32 v60, 0xffff0000, v12 +; GFX9-NEXT: v_add_f32_e32 v60, 0x40c00000, v60 +; GFX9-NEXT: v_bfe_u32 v61, v60, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v61, v61, v60 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX9-NEXT: v_add_u32_e32 v61, 0x7fff, v61 +; GFX9-NEXT: v_or_b32_e32 v62, 0x400000, v60 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v60, v60 +; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v60, v61, v62, vcc +; GFX9-NEXT: v_bfe_u32 v61, v12, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v61, v61, v12 +; GFX9-NEXT: v_add_u32_e32 v61, 0x7fff, v61 +; GFX9-NEXT: v_or_b32_e32 v62, 0x400000, v12 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v61, v62, vcc +; GFX9-NEXT: v_and_b32_e32 v61, 0xffff0000, v13 +; GFX9-NEXT: v_add_f32_e32 v61, 0x40c00000, v61 +; GFX9-NEXT: v_bfe_u32 v62, v61, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v62, v62, v61 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX9-NEXT: v_add_u32_e32 v62, 0x7fff, v62 +; GFX9-NEXT: v_mov_b32_e32 v1, v63 +; GFX9-NEXT: v_or_b32_e32 v63, 0x400000, v61 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v61, v61 +; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v61, v62, v63, vcc +; GFX9-NEXT: v_bfe_u32 v62, v13, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v62, v62, v13 +; GFX9-NEXT: v_add_u32_e32 v62, 0x7fff, v62 +; GFX9-NEXT: v_or_b32_e32 v63, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v62, v63, vcc +; GFX9-NEXT: v_and_b32_e32 v62, 0xffff0000, v14 +; GFX9-NEXT: v_add_f32_e32 v62, 0x40c00000, v62 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX9-NEXT: v_bfe_u32 v63, v62, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX9-NEXT: v_add_u32_e32 v63, v63, v62 +; GFX9-NEXT: v_or_b32_e32 v0, 0x400000, v62 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v62, v62 +; GFX9-NEXT: v_bfe_u32 v62, v14, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v63, 0x7fff, v63 +; GFX9-NEXT: v_add_u32_e32 v62, v62, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v63, v0, vcc +; GFX9-NEXT: v_add_u32_e32 v62, 0x7fff, v62 +; GFX9-NEXT: v_or_b32_e32 v63, 0x400000, v14 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v62, v63, vcc +; GFX9-NEXT: v_and_b32_e32 v62, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v62, 0x40c00000, v62 +; GFX9-NEXT: v_bfe_u32 v63, v62, 16, 1 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_add_u32_e32 v63, v63, v62 +; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v62 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v62, v62 +; GFX9-NEXT: v_lshlrev_b32_e32 v62, 16, v1 +; GFX9-NEXT: v_add_u32_e32 v63, 0x7fff, v63 +; GFX9-NEXT: v_add_f32_e32 v62, 0x40c00000, v62 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v63, v15, vcc +; GFX9-NEXT: v_bfe_u32 v63, v62, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v63, v63, v62 +; GFX9-NEXT: v_add_u32_e32 v63, 0x7fff, v63 +; GFX9-NEXT: v_or_b32_e32 v0, 0x400000, v62 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v62, v62 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v63, v0, vcc +; GFX9-NEXT: v_mov_b32_e32 v62, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX9-NEXT: v_and_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v63, v15, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX9-NEXT: v_and_b32_sdwa v1, v62, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v14, v0, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v61 +; GFX9-NEXT: v_and_b32_sdwa v1, v62, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v13, v0, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v60 +; GFX9-NEXT: v_and_b32_sdwa v1, v62, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v12, v0, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v59 +; GFX9-NEXT: v_and_b32_sdwa v1, v62, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v11, v0, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v58 +; GFX9-NEXT: v_and_b32_sdwa v1, v62, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v10, v0, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v57 +; GFX9-NEXT: v_and_b32_sdwa v1, v62, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v9, v0, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v56 +; GFX9-NEXT: v_and_b32_sdwa v1, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v8, v0, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v47 +; GFX9-NEXT: v_and_b32_sdwa v1, v62, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v7, v0, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v46 +; GFX9-NEXT: v_and_b32_sdwa v1, v62, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v6, v0, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v45 +; GFX9-NEXT: v_and_b32_sdwa v1, v62, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v5, v0, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v44 +; GFX9-NEXT: v_and_b32_sdwa v1, v62, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v4, v0, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v43 +; GFX9-NEXT: v_and_b32_sdwa v1, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v3, v0, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v42 +; GFX9-NEXT: v_and_b32_sdwa v1, v62, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v2, v0, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; GFX9-NEXT: v_and_b32_sdwa v1, v62, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v40 +; GFX9-NEXT: v_and_b32_sdwa v15, v62, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v55 +; GFX9-NEXT: v_and_b32_sdwa v16, v62, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v31, v15, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v54 +; GFX9-NEXT: v_and_b32_sdwa v16, v62, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v30, v15, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v53 +; GFX9-NEXT: v_and_b32_sdwa v16, v62, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v29, v15, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v52 +; GFX9-NEXT: v_and_b32_sdwa v16, v62, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v28, v15, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v51 +; GFX9-NEXT: v_and_b32_sdwa v16, v62, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v27, v15, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v50 +; GFX9-NEXT: v_and_b32_sdwa v16, v62, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v26, v15, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v49 +; GFX9-NEXT: v_and_b32_sdwa v16, v62, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v25, v15, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v48 +; GFX9-NEXT: v_and_b32_sdwa v16, v62, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v24, v15, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v39 +; GFX9-NEXT: v_and_b32_sdwa v16, v62, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v23, v15, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v38 +; GFX9-NEXT: v_and_b32_sdwa v16, v62, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v22, v15, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v37 +; GFX9-NEXT: v_and_b32_sdwa v16, v62, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v21, v15, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v36 +; GFX9-NEXT: v_and_b32_sdwa v16, v62, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v20, v15, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v35 +; GFX9-NEXT: v_and_b32_sdwa v16, v62, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v19, v15, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v34 +; GFX9-NEXT: v_and_b32_sdwa v16, v62, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v32, v15, 16, v16 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_sdwa v16, v62, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v17, v15, 16, v16 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_and_b32_sdwa v15, v62, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v15 +; GFX9-NEXT: .LBB101_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v15, v63 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v18, v32 -; GFX9-NEXT: v_readlane_b32 s31, v43, 1 -; GFX9-NEXT: v_readlane_b32 s30, v43, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB101_4: +; GFX9-NEXT: s_branch .LBB101_2 ; ; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v64f16_scalar: ; GFX11-TRUE16: ; %bb.0: @@ -236097,674 +234792,724 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; VI-LABEL: bitcast_v64bf16_to_v64i16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v42, s30, 0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; VI-NEXT: v_writelane_b32 v42, s31, 1 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v31, v17 ; VI-NEXT: v_mov_b32_e32 v30, v16 -; VI-NEXT: v_mov_b32_e32 v32, v15 -; VI-NEXT: v_mov_b32_e32 v33, v13 -; VI-NEXT: v_mov_b32_e32 v34, v11 -; VI-NEXT: v_mov_b32_e32 v35, v9 -; VI-NEXT: v_mov_b32_e32 v36, v7 -; VI-NEXT: v_mov_b32_e32 v37, v5 -; VI-NEXT: v_mov_b32_e32 v38, v3 +; VI-NEXT: v_mov_b32_e32 v48, v15 +; VI-NEXT: v_mov_b32_e32 v49, v13 +; VI-NEXT: v_mov_b32_e32 v50, v11 +; VI-NEXT: v_mov_b32_e32 v51, v9 +; VI-NEXT: v_mov_b32_e32 v52, v7 +; VI-NEXT: v_mov_b32_e32 v53, v5 +; VI-NEXT: v_mov_b32_e32 v54, v3 +; VI-NEXT: v_mov_b32_e32 v55, v1 ; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v26, v12 ; VI-NEXT: v_mov_b32_e32 v24, v10 ; VI-NEXT: v_mov_b32_e32 v22, v8 ; VI-NEXT: v_mov_b32_e32 v20, v6 -; VI-NEXT: v_mov_b32_e32 v48, v4 +; VI-NEXT: v_mov_b32_e32 v40, v4 ; VI-NEXT: v_mov_b32_e32 v16, v2 -; VI-NEXT: v_readfirstlane_b32 s30, v0 +; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_readfirstlane_b32 s31, v1 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: s_cbranch_scc0 .LBB105_3 +; VI-NEXT: v_mov_b32_e32 v39, s17 +; VI-NEXT: v_mov_b32_e32 v38, s19 +; VI-NEXT: v_mov_b32_e32 v37, s21 +; VI-NEXT: v_mov_b32_e32 v36, s23 +; VI-NEXT: v_mov_b32_e32 v35, s25 +; VI-NEXT: v_mov_b32_e32 v34, s27 +; VI-NEXT: v_mov_b32_e32 v18, s29 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_cbranch_scc0 .LBB105_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB105_4 +; VI-NEXT: s_cbranch_execnz .LBB105_3 ; VI-NEXT: .LBB105_2: ; %cmp.true -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 -; VI-NEXT: v_mov_b32_e32 v17, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v0, s4, v17 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_add_f32_e32 v0, s4, v17 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v16 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v5, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v54 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v15, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40 +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v15, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v5, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v53 +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v15, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v20 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v40, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v20 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v5, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v52 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v20, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v52, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v5, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v51 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v22, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v51 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v42, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v5, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v50 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v24, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v26 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v50, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v5, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v49 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v26, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v28 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v44, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v28 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v5, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v48 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v28, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v30 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v48, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v5, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v3 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v46, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v30, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v17 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v39 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: s_lshl_b32 s4, s30, 16 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_add_f32_e32 v2, s4, v17 -; VI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[0:1] -; VI-NEXT: v_add_f32_e32 v0, s4, v17 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; VI-NEXT: v_cndmask_b32_e32 v16, v3, v5, vcc ; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: s_lshl_b32 s4, s26, 16 -; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[2:3] -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_add_f32_e32 v0, s4, v17 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 -; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[0:1] -; VI-NEXT: v_add_f32_e32 v0, s4, v17 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_add_f32_e32 v0, s4, v17 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v56, v3, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[0:1] -; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v38 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_add_f32_e32 v0, s4, v17 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v5, vcc +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v38 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v5, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v58, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; VI-NEXT: v_bfe_u32 v4, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[0:1] -; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v37 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_add_f32_e32 v0, s4, v17 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v7, vcc +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v7, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v1 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b32_e32 v61, 16, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; VI-NEXT: v_bfe_u32 v6, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v60, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v1 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc +; VI-NEXT: v_bfe_u32 v6, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] -; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v36 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_add_f32_e32 v0, s4, v17 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v9, vcc +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v9, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v1 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; VI-NEXT: v_bfe_u32 v8, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v62, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v1 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; VI-NEXT: v_bfe_u32 v8, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] -; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_add_f32_e32 v0, s4, v17 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_add_f32_e32 v1, s4, v17 -; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v17 -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v17 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v18, v5, v7, vcc -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v17 -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v49, v5, v7, vcc -; VI-NEXT: v_add_f32_e32 v5, s4, v17 -; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc -; VI-NEXT: v_add_f32_e32 v7, s4, v17 -; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 -; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; VI-NEXT: v_add_f32_e32 v5, s4, v17 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; VI-NEXT: v_mov_b32_e32 v1, v18 -; VI-NEXT: v_cndmask_b32_e32 v18, v9, v11, vcc -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc -; VI-NEXT: v_add_f32_e32 v7, s4, v17 -; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 -; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50] -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 -; VI-NEXT: v_mov_b32_e32 v3, v49 -; VI-NEXT: v_cndmask_b32_e32 v49, v9, v11, vcc -; VI-NEXT: v_add_f32_e32 v9, s4, v17 -; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc +; VI-NEXT: v_bfe_u32 v11, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v0 ; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: s_lshl_b32 s4, s25, 16 -; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc -; VI-NEXT: v_add_f32_e32 v11, s4, v17 -; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 -; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v11, v13, vcc +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; VI-NEXT: v_bfe_u32 v13, v10, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v10 ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; VI-NEXT: v_add_f32_e32 v9, s4, v17 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v5 -; VI-NEXT: v_mov_b32_e32 v5, v18 -; VI-NEXT: v_cndmask_b32_e32 v18, v13, v15, vcc -; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v13, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 ; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: s_lshl_b32 s4, s27, 16 -; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc -; VI-NEXT: v_add_f32_e32 v11, s4, v17 -; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 -; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50] -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 -; VI-NEXT: v_mov_b32_e32 v7, v49 -; VI-NEXT: v_cndmask_b32_e32 v49, v13, v15, vcc -; VI-NEXT: v_add_f32_e32 v13, s4, v17 -; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 -; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] -; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v9 -; VI-NEXT: v_mov_b32_e32 v9, v18 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_cndmask_b32_e32 v13, v15, v18, vcc -; VI-NEXT: v_add_f32_e32 v15, s4, v17 -; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; VI-NEXT: v_add_f32_e32 v13, s4, v17 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_cndmask_b32_e32 v10, v11, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v13 +; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v34 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 ; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 -; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 ; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: s_lshl_b32 s4, s31, 16 ; VI-NEXT: v_cndmask_b32_e32 v13, v15, v19, vcc -; VI-NEXT: v_add_f32_e32 v15, s4, v17 -; VI-NEXT: v_bfe_u32 v17, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15 -; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50] -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v34 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v19, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v15 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v12, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v32, v19, v32, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v12 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v15, v15, v19, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v15 +; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v19, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v15 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: v_mov_b32_e32 v11, v49 -; VI-NEXT: v_cndmask_b32_e32 v49, v17, v19, vcc -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v16 -; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; VI-NEXT: v_mov_b32_e32 v13, v18 -; VI-NEXT: v_bfe_u32 v18, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v16 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v17 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v19, vcc -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v18 -; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v38 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_cndmask_b32_e32 v15, v19, v34, vcc ; VI-NEXT: v_bfe_u32 v19, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v18 ; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19 -; VI-NEXT: v_or_b32_e32 v21, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_cndmask_b32_e32 v18, v19, v21, vcc -; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v38 -; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; VI-NEXT: v_bfe_u32 v21, v19, 16, 1 -; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v19 -; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 -; VI-NEXT: v_or_b32_e32 v23, 0x400000, v19 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v48 -; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; VI-NEXT: v_cndmask_b32_e32 v38, v21, v23, vcc -; VI-NEXT: v_bfe_u32 v21, v19, 16, 1 -; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v19 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v48 -; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 -; VI-NEXT: v_or_b32_e32 v23, 0x400000, v19 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_cndmask_b32_e32 v19, v21, v23, vcc -; VI-NEXT: v_bfe_u32 v21, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v18 -; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 -; VI-NEXT: v_or_b32_e32 v23, 0x400000, v18 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_cndmask_b32_e32 v18, v21, v23, vcc -; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v37 -; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; VI-NEXT: v_bfe_u32 v23, v21, 16, 1 -; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v21 -; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 -; VI-NEXT: v_or_b32_e32 v25, 0x400000, v21 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: v_cndmask_b32_e32 v21, v23, v25, vcc -; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v37 -; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50] -; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; VI-NEXT: v_bfe_u32 v25, v23, 16, 1 -; VI-NEXT: v_mov_b32_e32 v15, v49 -; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v23 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v21 -; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v20 -; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 -; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; VI-NEXT: v_or_b32_e32 v27, 0x400000, v23 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: v_bfe_u32 v23, v20, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v48, v25, v27, vcc -; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v20 -; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 -; VI-NEXT: v_or_b32_e32 v25, 0x400000, v20 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v21 -; VI-NEXT: v_bfe_u32 v21, v20, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v23, v23, v25, vcc -; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v20 -; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 -; VI-NEXT: v_or_b32_e32 v25, 0x400000, v20 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; VI-NEXT: v_cndmask_b32_e32 v20, v21, v25, vcc -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v23 -; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v36 -; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; VI-NEXT: v_bfe_u32 v25, v23, 16, 1 -; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v23 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 -; VI-NEXT: v_or_b32_e32 v27, 0x400000, v23 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: v_cndmask_b32_e32 v23, v25, v27, vcc -; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v36 -; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; VI-NEXT: v_bfe_u32 v27, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v25 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 -; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v22 -; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 -; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; VI-NEXT: v_or_b32_e32 v29, 0x400000, v25 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: v_bfe_u32 v25, v22, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v36, v27, v29, vcc -; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v22 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 -; VI-NEXT: v_or_b32_e32 v27, 0x400000, v22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v23 -; VI-NEXT: v_bfe_u32 v23, v22, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v25, v25, v27, vcc -; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v22 -; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 -; VI-NEXT: v_or_b32_e32 v27, 0x400000, v22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; VI-NEXT: v_cndmask_b32_e32 v22, v23, v27, vcc -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v25 -; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v35 -; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; VI-NEXT: v_bfe_u32 v27, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v25 -; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 -; VI-NEXT: v_or_b32_e32 v29, 0x400000, v25 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: v_cndmask_b32_e32 v25, v27, v29, vcc -; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v35 -; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; VI-NEXT: v_bfe_u32 v29, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v27 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v25 -; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v24 -; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 -; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; VI-NEXT: v_bfe_u32 v27, v24, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v50, v29, v35, vcc -; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v24 -; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 -; VI-NEXT: v_or_b32_e32 v29, 0x400000, v24 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v25 -; VI-NEXT: v_bfe_u32 v25, v24, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v27, v27, v29, vcc -; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v24 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 -; VI-NEXT: v_or_b32_e32 v29, 0x400000, v24 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; VI-NEXT: v_cndmask_b32_e32 v24, v25, v29, vcc -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v27 -; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v34 -; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; VI-NEXT: v_bfe_u32 v29, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v27 -; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; VI-NEXT: v_cndmask_b32_e32 v27, v29, v35, vcc -; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v34 -; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; VI-NEXT: v_bfe_u32 v34, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v29 +; VI-NEXT: v_cndmask_b32_e32 v18, v19, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v34, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v14 ; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 ; VI-NEXT: v_cndmask_b32_e32 v34, v34, v35, vcc -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v27 -; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v26 -; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; VI-NEXT: v_bfe_u32 v29, v26, 16, 1 -; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v26 -; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 -; VI-NEXT: v_or_b32_e32 v52, 0x400000, v26 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v27 -; VI-NEXT: v_bfe_u32 v27, v26, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v29, v29, v52, vcc -; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v26 -; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 -; VI-NEXT: v_or_b32_e32 v52, 0x400000, v26 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: v_cndmask_b32_e32 v26, v27, v52, vcc -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v29 -; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v33 -; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; VI-NEXT: v_bfe_u32 v52, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v29 -; VI-NEXT: v_add_u32_e32 v52, vcc, 0x7fff, v52 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_or_b32_e32 v53, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; VI-NEXT: v_cndmask_b32_e32 v29, v52, v53, vcc -; VI-NEXT: v_bfe_u32 v52, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v33 -; VI-NEXT: v_add_u32_e32 v52, vcc, 0x7fff, v52 -; VI-NEXT: v_or_b32_e32 v53, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; VI-NEXT: v_cndmask_b32_e32 v52, v52, v53, vcc -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v29 -; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v28 -; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; VI-NEXT: v_bfe_u32 v33, v28, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v28 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v54, 0x400000, v28 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v29 -; VI-NEXT: v_bfe_u32 v29, v28, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v33, v33, v54, vcc -; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v28 -; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 -; VI-NEXT: v_or_b32_e32 v54, 0x400000, v28 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; VI-NEXT: v_cndmask_b32_e32 v28, v29, v54, vcc -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v33 -; VI-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 -; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; VI-NEXT: v_bfe_u32 v54, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v33 -; VI-NEXT: v_add_u32_e32 v54, vcc, 0x7fff, v54 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_or_b32_e32 v55, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_cndmask_b32_e32 v33, v54, v55, vcc -; VI-NEXT: v_bfe_u32 v54, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v32 -; VI-NEXT: v_add_u32_e32 v54, vcc, 0x7fff, v54 -; VI-NEXT: v_or_b32_e32 v55, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; VI-NEXT: v_cndmask_b32_e32 v32, v54, v55, vcc -; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; VI-NEXT: v_bfe_u32 v55, v30, 16, 1 -; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v30 -; VI-NEXT: v_add_u32_e32 v55, vcc, 0x7fff, v55 -; VI-NEXT: v_or_b32_e32 v40, 0x400000, v30 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; VI-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 -; VI-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc -; VI-NEXT: v_bfe_u32 v55, v54, 16, 1 -; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v54 -; VI-NEXT: v_add_u32_e32 v55, vcc, 0x7fff, v55 -; VI-NEXT: v_or_b32_e32 v40, 0x400000, v54 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v54, v54 -; VI-NEXT: v_cndmask_b32_e32 v54, v55, v40, vcc -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v30 -; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v31 -; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; VI-NEXT: v_bfe_u32 v40, v30, 16, 1 -; VI-NEXT: v_add_u32_e32 v40, vcc, v40, v30 -; VI-NEXT: v_add_u32_e32 v40, vcc, 0x7fff, v40 -; VI-NEXT: v_or_b32_e32 v41, 0x400000, v30 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v31 -; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; VI-NEXT: v_bfe_u32 v31, v30, 16, 1 -; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[48:49] -; VI-NEXT: v_cndmask_b32_e32 v40, v40, v41, vcc -; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v30 -; VI-NEXT: v_lshrrev_b64 v[36:37], 16, v[36:37] -; VI-NEXT: v_add_u32_e32 v31, vcc, 0x7fff, v31 -; VI-NEXT: v_mov_b32_e32 v37, v48 -; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[50:51] -; VI-NEXT: v_or_b32_e32 v41, 0x400000, v30 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; VI-NEXT: v_lshrrev_b64 v[34:35], 16, v[34:35] -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_cndmask_b32_e32 v30, v31, v41, vcc -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v40 -; VI-NEXT: v_mov_b32_e32 v35, v48 -; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[52:53] -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[32:33] -; VI-NEXT: v_lshrrev_b64 v[50:51], 16, v[30:31] -; VI-NEXT: v_lshrrev_b64 v[38:39], 16, v[38:39] -; VI-NEXT: v_mov_b32_e32 v33, v48 -; VI-NEXT: v_lshrrev_b64 v[30:31], 16, v[54:55] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_cndmask_b32_e32 v14, v15, v35, vcc +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v34 +; VI-NEXT: v_and_b32_e32 v34, 0xffff0000, v55 +; VI-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; VI-NEXT: v_bfe_u32 v35, v34, 16, 1 +; VI-NEXT: v_add_u32_e32 v35, vcc, v35, v34 +; VI-NEXT: v_add_u32_e32 v35, vcc, 0x7fff, v35 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 +; VI-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc +; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v55 +; VI-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; VI-NEXT: v_bfe_u32 v36, v35, 16, 1 +; VI-NEXT: v_add_u32_e32 v36, vcc, v36, v35 +; VI-NEXT: v_add_u32_e32 v36, vcc, 0x7fff, v36 +; VI-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v35, v35 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v34 +; VI-NEXT: v_lshrrev_b64 v[34:35], 16, v[56:57] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_mov_b32_e32 v39, v34 +; VI-NEXT: v_lshrrev_b64 v[34:35], 16, v[58:59] +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v38, v34 +; VI-NEXT: v_lshrrev_b64 v[34:35], 16, v[60:61] +; VI-NEXT: v_cndmask_b32_e32 v54, v36, v37, vcc +; VI-NEXT: v_mov_b32_e32 v37, v34 +; VI-NEXT: v_lshrrev_b64 v[34:35], 16, v[62:63] +; VI-NEXT: v_mov_b32_e32 v35, v0 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[32:33] +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v36, v34 +; VI-NEXT: v_mov_b32_e32 v34, v0 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[54:55] +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[14:15] +; VI-NEXT: v_mov_b32_e32 v55, v0 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[40:41] +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[12:13] +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[10:11] +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[8:9] +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[6:7] +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[4:5] +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] ; VI-NEXT: v_lshrrev_b64 v[28:29], 16, v[28:29] ; VI-NEXT: v_lshrrev_b64 v[26:27], 16, v[26:27] ; VI-NEXT: v_lshrrev_b64 v[24:25], 16, v[24:25] ; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[22:23] ; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[20:21] -; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[18:19] -; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[16:17] -; VI-NEXT: v_mov_b32_e32 v31, v50 -; VI-NEXT: s_branch .LBB105_5 -; VI-NEXT: .LBB105_3: -; VI-NEXT: s_branch .LBB105_2 -; VI-NEXT: .LBB105_4: -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 -; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 -; VI-NEXT: v_mov_b32_e32 v13, s29 -; VI-NEXT: v_mov_b32_e32 v14, s30 -; VI-NEXT: v_mov_b32_e32 v15, s31 -; VI-NEXT: .LBB105_5: ; %end -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v17, v38 -; VI-NEXT: v_mov_b32_e32 v18, v48 -; VI-NEXT: v_mov_b32_e32 v19, v37 -; VI-NEXT: v_mov_b32_e32 v21, v36 -; VI-NEXT: v_mov_b32_e32 v23, v35 -; VI-NEXT: v_mov_b32_e32 v25, v34 -; VI-NEXT: v_mov_b32_e32 v27, v33 -; VI-NEXT: v_mov_b32_e32 v29, v32 -; VI-NEXT: v_readlane_b32 s31, v42, 1 -; VI-NEXT: v_readlane_b32 s30, v42, 0 -; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[32:33] +; VI-NEXT: v_mov_b32_e32 v54, v32 +; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[52:53] +; VI-NEXT: v_mov_b32_e32 v53, v0 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[42:43] +; VI-NEXT: v_mov_b32_e32 v52, v32 +; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[50:51] +; VI-NEXT: v_mov_b32_e32 v51, v0 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[44:45] +; VI-NEXT: v_mov_b32_e32 v50, v32 +; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[48:49] +; VI-NEXT: v_mov_b32_e32 v49, v0 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[16:17] +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v48, v32 +; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[30:31] +; VI-NEXT: v_lshrrev_b64 v[30:31], 16, v[46:47] +; VI-NEXT: v_mov_b32_e32 v31, v32 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b64 v[40:41], 16, v[15:16] +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[15:16] +; VI-NEXT: .LBB105_3: ; %end +; VI-NEXT: v_mov_b32_e32 v13, v18 +; VI-NEXT: v_mov_b32_e32 v18, v40 +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v1, v39 +; VI-NEXT: v_mov_b32_e32 v3, v38 +; VI-NEXT: v_mov_b32_e32 v5, v37 +; VI-NEXT: v_mov_b32_e32 v7, v36 +; VI-NEXT: v_mov_b32_e32 v9, v35 +; VI-NEXT: v_mov_b32_e32 v11, v34 +; VI-NEXT: v_mov_b32_e32 v15, v55 +; VI-NEXT: v_mov_b32_e32 v17, v54 +; VI-NEXT: v_mov_b32_e32 v19, v53 +; VI-NEXT: v_mov_b32_e32 v21, v52 +; VI-NEXT: v_mov_b32_e32 v23, v51 +; VI-NEXT: v_mov_b32_e32 v25, v50 +; VI-NEXT: v_mov_b32_e32 v27, v49 +; VI-NEXT: v_mov_b32_e32 v29, v48 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB105_4: +; VI-NEXT: s_branch .LBB105_2 ; ; GFX9-LABEL: bitcast_v64bf16_to_v64i16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v43, s30, 0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GFX9-NEXT: v_writelane_b32 v43, s31, 1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v31, v17 ; GFX9-NEXT: v_mov_b32_e32 v30, v16 ; GFX9-NEXT: v_mov_b32_e32 v29, v15 @@ -236781,626 +235526,637 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: v_mov_b32_e32 v32, v4 ; GFX9-NEXT: v_mov_b32_e32 v17, v3 ; GFX9-NEXT: v_mov_b32_e32 v16, v2 -; GFX9-NEXT: v_readfirstlane_b32 s30, v0 +; GFX9-NEXT: v_mov_b32_e32 v63, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_readfirstlane_b32 s31, v1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_cbranch_scc0 .LBB105_3 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB105_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB105_4 +; GFX9-NEXT: s_cbranch_execnz .LBB105_3 ; GFX9-NEXT: .LBB105_2: ; %cmp.true -; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; GFX9-NEXT: s_and_b32 s4, s30, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s30, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b32 s4, s31, 0xffff0000 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: s_lshl_b32 s4, s31, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_mov_b32_e32 v18, 0xffff0000 -; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 -; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_and_or_b32 v14, v1, v18, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_and_or_b32 v15, v3, v18, v4 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s29, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: s_and_b32 s4, s28, 0xffff0000 -; GFX9-NEXT: v_and_or_b32 v13, v1, v18, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s28, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000 -; GFX9-NEXT: v_and_or_b32 v12, v1, v18, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s27, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: s_and_b32 s4, s26, 0xffff0000 -; GFX9-NEXT: v_and_or_b32 v11, v1, v18, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s26, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000 -; GFX9-NEXT: v_and_or_b32 v10, v1, v18, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s25, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: s_and_b32 s4, s24, 0xffff0000 -; GFX9-NEXT: v_and_or_b32 v9, v1, v18, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s24, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 -; GFX9-NEXT: v_and_or_b32 v8, v1, v18, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s23, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 -; GFX9-NEXT: v_and_or_b32 v7, v1, v18, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s22, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX9-NEXT: v_and_or_b32 v6, v1, v18, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s21, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX9-NEXT: v_and_or_b32 v5, v1, v18, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s20, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX9-NEXT: v_and_or_b32 v4, v1, v18, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s19, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v33, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 -; GFX9-NEXT: v_and_or_b32 v3, v1, v18, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s18, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v33, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v33, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v33, v33, v2 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v16 +; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v33, v33, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; GFX9-NEXT: v_add_u32_e32 v33, 0x7fff, v33 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX9-NEXT: v_and_or_b32 v2, v1, v18, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v33, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v33, v33, v1 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v16, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v33, v33, v16 ; GFX9-NEXT: v_add_u32_e32 v33, 0x7fff, v33 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s17, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc -; GFX9-NEXT: v_add_f32_e32 v33, s4, v0 -; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 -; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 -; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX9-NEXT: v_and_or_b32 v1, v1, v18, v33 -; GFX9-NEXT: v_add_f32_e32 v33, s4, v0 -; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 -; GFX9-NEXT: s_lshl_b32 s4, s16, 16 -; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 -; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; GFX9-NEXT: v_bfe_u32 v34, v0, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v34, v34, v0 -; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 -; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v34, v35, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_and_or_b32 v0, v33, v18, v0 -; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v16 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v17 ; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 ; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 ; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; GFX9-NEXT: v_bfe_u32 v34, v16, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v34, v34, v16 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_cndmask_b32_e32 v15, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v17 ; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 -; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v16 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v34, v35, vcc -; GFX9-NEXT: v_and_b32_e32 v34, 0xffff0000, v17 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_and_b32_e32 v34, 0xffff0000, v32 ; GFX9-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 ; GFX9-NEXT: v_bfe_u32 v35, v34, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v35, v35, v34 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v32 ; GFX9-NEXT: v_add_u32_e32 v35, 0x7fff, v35 ; GFX9-NEXT: v_or_b32_e32 v36, 0x400000, v34 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 -; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; GFX9-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc -; GFX9-NEXT: v_bfe_u32 v35, v17, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v35, v35, v17 +; GFX9-NEXT: v_bfe_u32 v35, v32, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v35, v35, v32 ; GFX9-NEXT: v_add_u32_e32 v35, 0x7fff, v35 -; GFX9-NEXT: v_or_b32_e32 v36, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v35, v36, vcc -; GFX9-NEXT: v_and_b32_e32 v35, 0xffff0000, v32 +; GFX9-NEXT: v_or_b32_e32 v36, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v35, v36, vcc +; GFX9-NEXT: v_and_b32_e32 v35, 0xffff0000, v19 ; GFX9-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 ; GFX9-NEXT: v_bfe_u32 v36, v35, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v36, v36, v35 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; GFX9-NEXT: v_add_u32_e32 v36, 0x7fff, v36 ; GFX9-NEXT: v_or_b32_e32 v37, 0x400000, v35 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v35, v35 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; GFX9-NEXT: v_cndmask_b32_e32 v35, v36, v37, vcc -; GFX9-NEXT: v_bfe_u32 v36, v32, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v36, v36, v32 +; GFX9-NEXT: v_bfe_u32 v36, v19, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v36, v36, v19 ; GFX9-NEXT: v_add_u32_e32 v36, 0x7fff, v36 -; GFX9-NEXT: v_or_b32_e32 v37, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v36, v37, vcc -; GFX9-NEXT: v_and_b32_e32 v36, 0xffff0000, v19 +; GFX9-NEXT: v_or_b32_e32 v37, 0x400000, v19 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v36, v37, vcc +; GFX9-NEXT: v_and_b32_e32 v36, 0xffff0000, v20 ; GFX9-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 ; GFX9-NEXT: v_bfe_u32 v37, v36, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v37, v37, v36 -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; GFX9-NEXT: v_add_u32_e32 v37, 0x7fff, v37 ; GFX9-NEXT: v_or_b32_e32 v38, 0x400000, v36 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v36, v36 -; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; GFX9-NEXT: v_cndmask_b32_e32 v36, v37, v38, vcc -; GFX9-NEXT: v_bfe_u32 v37, v19, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v37, v37, v19 +; GFX9-NEXT: v_bfe_u32 v37, v20, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v37, v37, v20 ; GFX9-NEXT: v_add_u32_e32 v37, 0x7fff, v37 -; GFX9-NEXT: v_or_b32_e32 v38, 0x400000, v19 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v37, v38, vcc -; GFX9-NEXT: v_and_b32_e32 v37, 0xffff0000, v20 +; GFX9-NEXT: v_or_b32_e32 v38, 0x400000, v20 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v37, v38, vcc +; GFX9-NEXT: v_and_b32_e32 v37, 0xffff0000, v21 ; GFX9-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 ; GFX9-NEXT: v_bfe_u32 v38, v37, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v38, v38, v37 -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; GFX9-NEXT: v_add_u32_e32 v38, 0x7fff, v38 ; GFX9-NEXT: v_or_b32_e32 v39, 0x400000, v37 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v37, v37 -; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 ; GFX9-NEXT: v_cndmask_b32_e32 v37, v38, v39, vcc -; GFX9-NEXT: v_bfe_u32 v38, v20, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v38, v38, v20 +; GFX9-NEXT: v_bfe_u32 v38, v21, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v38, v38, v21 ; GFX9-NEXT: v_add_u32_e32 v38, 0x7fff, v38 -; GFX9-NEXT: v_or_b32_e32 v39, 0x400000, v20 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; GFX9-NEXT: v_cndmask_b32_e32 v20, v38, v39, vcc -; GFX9-NEXT: v_and_b32_e32 v38, 0xffff0000, v21 +; GFX9-NEXT: v_or_b32_e32 v39, 0x400000, v21 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v38, v39, vcc +; GFX9-NEXT: v_and_b32_e32 v38, 0xffff0000, v22 ; GFX9-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 ; GFX9-NEXT: v_bfe_u32 v39, v38, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v39, v39, v38 -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; GFX9-NEXT: v_add_u32_e32 v39, 0x7fff, v39 ; GFX9-NEXT: v_or_b32_e32 v48, 0x400000, v38 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v38, v38 -; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 ; GFX9-NEXT: v_cndmask_b32_e32 v38, v39, v48, vcc -; GFX9-NEXT: v_bfe_u32 v39, v21, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v39, v39, v21 +; GFX9-NEXT: v_bfe_u32 v39, v22, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v39, v39, v22 ; GFX9-NEXT: v_add_u32_e32 v39, 0x7fff, v39 -; GFX9-NEXT: v_or_b32_e32 v48, 0x400000, v21 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; GFX9-NEXT: v_cndmask_b32_e32 v21, v39, v48, vcc -; GFX9-NEXT: v_and_b32_e32 v39, 0xffff0000, v22 +; GFX9-NEXT: v_or_b32_e32 v48, 0x400000, v22 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v39, v48, vcc +; GFX9-NEXT: v_and_b32_e32 v39, 0xffff0000, v23 ; GFX9-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 ; GFX9-NEXT: v_bfe_u32 v48, v39, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v48, v48, v39 -; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; GFX9-NEXT: v_add_u32_e32 v48, 0x7fff, v48 ; GFX9-NEXT: v_or_b32_e32 v49, 0x400000, v39 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v39, v39 -; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 ; GFX9-NEXT: v_cndmask_b32_e32 v39, v48, v49, vcc -; GFX9-NEXT: v_bfe_u32 v48, v22, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v48, v48, v22 +; GFX9-NEXT: v_bfe_u32 v48, v23, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v48, v48, v23 ; GFX9-NEXT: v_add_u32_e32 v48, 0x7fff, v48 -; GFX9-NEXT: v_or_b32_e32 v49, 0x400000, v22 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; GFX9-NEXT: v_cndmask_b32_e32 v22, v48, v49, vcc -; GFX9-NEXT: v_and_b32_e32 v48, 0xffff0000, v23 +; GFX9-NEXT: v_or_b32_e32 v49, 0x400000, v23 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v48, v49, vcc +; GFX9-NEXT: v_and_b32_e32 v48, 0xffff0000, v24 ; GFX9-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 ; GFX9-NEXT: v_bfe_u32 v49, v48, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v49, v49, v48 -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; GFX9-NEXT: v_add_u32_e32 v49, 0x7fff, v49 ; GFX9-NEXT: v_or_b32_e32 v50, 0x400000, v48 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v48, v48 -; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 ; GFX9-NEXT: v_cndmask_b32_e32 v48, v49, v50, vcc -; GFX9-NEXT: v_bfe_u32 v49, v23, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v49, v49, v23 +; GFX9-NEXT: v_bfe_u32 v49, v24, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v49, v49, v24 ; GFX9-NEXT: v_add_u32_e32 v49, 0x7fff, v49 -; GFX9-NEXT: v_or_b32_e32 v50, 0x400000, v23 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v23, v49, v50, vcc -; GFX9-NEXT: v_and_b32_e32 v49, 0xffff0000, v24 +; GFX9-NEXT: v_or_b32_e32 v50, 0x400000, v24 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v49, v50, vcc +; GFX9-NEXT: v_and_b32_e32 v49, 0xffff0000, v25 ; GFX9-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 ; GFX9-NEXT: v_bfe_u32 v50, v49, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v50, v50, v49 -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; GFX9-NEXT: v_add_u32_e32 v50, 0x7fff, v50 ; GFX9-NEXT: v_or_b32_e32 v51, 0x400000, v49 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v49, v49 -; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 ; GFX9-NEXT: v_cndmask_b32_e32 v49, v50, v51, vcc -; GFX9-NEXT: v_bfe_u32 v50, v24, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v50, v50, v24 +; GFX9-NEXT: v_bfe_u32 v50, v25, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v50, v50, v25 ; GFX9-NEXT: v_add_u32_e32 v50, 0x7fff, v50 -; GFX9-NEXT: v_or_b32_e32 v51, 0x400000, v24 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v24, v50, v51, vcc -; GFX9-NEXT: v_and_b32_e32 v50, 0xffff0000, v25 +; GFX9-NEXT: v_or_b32_e32 v51, 0x400000, v25 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v50, v51, vcc +; GFX9-NEXT: v_and_b32_e32 v50, 0xffff0000, v26 ; GFX9-NEXT: v_add_f32_e32 v50, 0x40c00000, v50 ; GFX9-NEXT: v_bfe_u32 v51, v50, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v51, v51, v50 -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; GFX9-NEXT: v_add_u32_e32 v51, 0x7fff, v51 ; GFX9-NEXT: v_or_b32_e32 v52, 0x400000, v50 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v50, v50 -; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 ; GFX9-NEXT: v_cndmask_b32_e32 v50, v51, v52, vcc -; GFX9-NEXT: v_bfe_u32 v51, v25, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v51, v51, v25 +; GFX9-NEXT: v_bfe_u32 v51, v26, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v51, v51, v26 ; GFX9-NEXT: v_add_u32_e32 v51, 0x7fff, v51 -; GFX9-NEXT: v_or_b32_e32 v52, 0x400000, v25 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; GFX9-NEXT: v_cndmask_b32_e32 v25, v51, v52, vcc -; GFX9-NEXT: v_and_b32_e32 v51, 0xffff0000, v26 +; GFX9-NEXT: v_or_b32_e32 v52, 0x400000, v26 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v51, v52, vcc +; GFX9-NEXT: v_and_b32_e32 v51, 0xffff0000, v27 ; GFX9-NEXT: v_add_f32_e32 v51, 0x40c00000, v51 ; GFX9-NEXT: v_bfe_u32 v52, v51, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v52, v52, v51 -; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; GFX9-NEXT: v_add_u32_e32 v52, 0x7fff, v52 ; GFX9-NEXT: v_or_b32_e32 v53, 0x400000, v51 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v51, v51 -; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 ; GFX9-NEXT: v_cndmask_b32_e32 v51, v52, v53, vcc -; GFX9-NEXT: v_bfe_u32 v52, v26, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v52, v52, v26 +; GFX9-NEXT: v_bfe_u32 v52, v27, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v52, v52, v27 ; GFX9-NEXT: v_add_u32_e32 v52, 0x7fff, v52 -; GFX9-NEXT: v_or_b32_e32 v53, 0x400000, v26 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; GFX9-NEXT: v_cndmask_b32_e32 v26, v52, v53, vcc -; GFX9-NEXT: v_and_b32_e32 v52, 0xffff0000, v27 +; GFX9-NEXT: v_or_b32_e32 v53, 0x400000, v27 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v52, v53, vcc +; GFX9-NEXT: v_and_b32_e32 v52, 0xffff0000, v28 ; GFX9-NEXT: v_add_f32_e32 v52, 0x40c00000, v52 ; GFX9-NEXT: v_bfe_u32 v53, v52, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v53, v53, v52 -; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; GFX9-NEXT: v_add_u32_e32 v53, 0x7fff, v53 ; GFX9-NEXT: v_or_b32_e32 v54, 0x400000, v52 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v52, v52 -; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; GFX9-NEXT: v_cndmask_b32_e32 v52, v53, v54, vcc -; GFX9-NEXT: v_bfe_u32 v53, v27, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v53, v53, v27 +; GFX9-NEXT: v_bfe_u32 v53, v28, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v53, v53, v28 ; GFX9-NEXT: v_add_u32_e32 v53, 0x7fff, v53 -; GFX9-NEXT: v_or_b32_e32 v54, 0x400000, v27 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; GFX9-NEXT: v_cndmask_b32_e32 v27, v53, v54, vcc -; GFX9-NEXT: v_and_b32_e32 v53, 0xffff0000, v28 +; GFX9-NEXT: v_or_b32_e32 v54, 0x400000, v28 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc +; GFX9-NEXT: v_and_b32_e32 v53, 0xffff0000, v29 ; GFX9-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 ; GFX9-NEXT: v_bfe_u32 v54, v53, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v54, v54, v53 -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; GFX9-NEXT: v_add_u32_e32 v54, 0x7fff, v54 ; GFX9-NEXT: v_or_b32_e32 v55, 0x400000, v53 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v53, v53 -; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 ; GFX9-NEXT: v_cndmask_b32_e32 v53, v54, v55, vcc -; GFX9-NEXT: v_bfe_u32 v54, v28, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v54, v54, v28 +; GFX9-NEXT: v_bfe_u32 v54, v29, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v54, v54, v29 ; GFX9-NEXT: v_add_u32_e32 v54, 0x7fff, v54 -; GFX9-NEXT: v_or_b32_e32 v55, 0x400000, v28 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; GFX9-NEXT: v_cndmask_b32_e32 v28, v54, v55, vcc -; GFX9-NEXT: v_and_b32_e32 v54, 0xffff0000, v29 +; GFX9-NEXT: v_or_b32_e32 v55, 0x400000, v29 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v54, v55, vcc +; GFX9-NEXT: v_and_b32_e32 v54, 0xffff0000, v30 ; GFX9-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 ; GFX9-NEXT: v_bfe_u32 v55, v54, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v55, v55, v54 -; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; GFX9-NEXT: v_add_u32_e32 v55, 0x7fff, v55 ; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v54 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v54, v54 -; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; GFX9-NEXT: v_cndmask_b32_e32 v54, v55, v40, vcc -; GFX9-NEXT: v_bfe_u32 v55, v29, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v55, v55, v29 +; GFX9-NEXT: v_bfe_u32 v55, v30, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v55, v55, v30 ; GFX9-NEXT: v_add_u32_e32 v55, 0x7fff, v55 -; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v29 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; GFX9-NEXT: v_cndmask_b32_e32 v29, v55, v40, vcc -; GFX9-NEXT: v_and_b32_e32 v55, 0xffff0000, v30 +; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v30 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc +; GFX9-NEXT: v_and_b32_e32 v55, 0xffff0000, v31 ; GFX9-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 ; GFX9-NEXT: v_bfe_u32 v40, v55, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v40, v40, v55 -; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; GFX9-NEXT: v_add_u32_e32 v40, 0x7fff, v40 ; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v55 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v55, v55 -; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 ; GFX9-NEXT: v_cndmask_b32_e32 v55, v40, v41, vcc -; GFX9-NEXT: v_bfe_u32 v40, v30, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v40, v40, v30 +; GFX9-NEXT: v_bfe_u32 v40, v31, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v40, v40, v31 ; GFX9-NEXT: v_add_u32_e32 v40, 0x7fff, v40 -; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v30 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; GFX9-NEXT: v_cndmask_b32_e32 v30, v40, v41, vcc -; GFX9-NEXT: v_and_b32_e32 v40, 0xffff0000, v31 +; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v31 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v40, v41, vcc +; GFX9-NEXT: v_and_b32_e32 v40, 0xffff0000, v0 ; GFX9-NEXT: v_add_f32_e32 v40, 0x40c00000, v40 ; GFX9-NEXT: v_bfe_u32 v41, v40, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v41, v41, v40 -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_add_u32_e32 v41, 0x7fff, v41 ; GFX9-NEXT: v_or_b32_e32 v42, 0x400000, v40 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v40, v40 -; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v40, v41, v42, vcc -; GFX9-NEXT: v_bfe_u32 v41, v31, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v41, v41, v31 +; GFX9-NEXT: v_bfe_u32 v41, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v41, v41, v0 ; GFX9-NEXT: v_add_u32_e32 v41, 0x7fff, v41 -; GFX9-NEXT: v_or_b32_e32 v42, 0x400000, v31 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; GFX9-NEXT: v_cndmask_b32_e32 v31, v41, v42, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GFX9-NEXT: v_and_or_b32 v31, v40, v18, v31 -; GFX9-NEXT: v_and_or_b32 v30, v55, v18, v30 -; GFX9-NEXT: v_and_or_b32 v29, v54, v18, v29 -; GFX9-NEXT: v_and_or_b32 v28, v53, v18, v28 -; GFX9-NEXT: v_and_or_b32 v27, v52, v18, v27 -; GFX9-NEXT: v_and_or_b32 v26, v51, v18, v26 -; GFX9-NEXT: v_and_or_b32 v25, v50, v18, v25 -; GFX9-NEXT: v_and_or_b32 v24, v49, v18, v24 -; GFX9-NEXT: v_and_or_b32 v23, v48, v18, v23 -; GFX9-NEXT: v_and_or_b32 v22, v39, v18, v22 -; GFX9-NEXT: v_and_or_b32 v21, v38, v18, v21 -; GFX9-NEXT: v_and_or_b32 v20, v37, v18, v20 -; GFX9-NEXT: v_and_or_b32 v19, v36, v18, v19 -; GFX9-NEXT: v_and_or_b32 v32, v35, v18, v32 -; GFX9-NEXT: v_and_or_b32 v17, v34, v18, v17 -; GFX9-NEXT: v_and_or_b32 v16, v33, v18, v16 -; GFX9-NEXT: s_branch .LBB105_5 -; GFX9-NEXT: .LBB105_3: -; GFX9-NEXT: s_branch .LBB105_2 -; GFX9-NEXT: .LBB105_4: -; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 -; GFX9-NEXT: v_mov_b32_e32 v4, s20 -; GFX9-NEXT: v_mov_b32_e32 v5, s21 -; GFX9-NEXT: v_mov_b32_e32 v6, s22 -; GFX9-NEXT: v_mov_b32_e32 v7, s23 -; GFX9-NEXT: v_mov_b32_e32 v8, s24 -; GFX9-NEXT: v_mov_b32_e32 v9, s25 -; GFX9-NEXT: v_mov_b32_e32 v10, s26 -; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: v_mov_b32_e32 v12, s28 -; GFX9-NEXT: v_mov_b32_e32 v13, s29 -; GFX9-NEXT: v_mov_b32_e32 v14, s30 -; GFX9-NEXT: v_mov_b32_e32 v15, s31 -; GFX9-NEXT: .LBB105_5: ; %end -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_e32 v42, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v41, v42, vcc +; GFX9-NEXT: v_and_b32_e32 v41, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v41, 0x40c00000, v41 +; GFX9-NEXT: v_bfe_u32 v42, v41, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v42, v42, v41 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_add_u32_e32 v42, 0x7fff, v42 +; GFX9-NEXT: v_or_b32_e32 v43, 0x400000, v41 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v41, v41 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v41, v42, v43, vcc +; GFX9-NEXT: v_bfe_u32 v42, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v42, v42, v1 +; GFX9-NEXT: v_add_u32_e32 v42, 0x7fff, v42 +; GFX9-NEXT: v_or_b32_e32 v43, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v42, v43, vcc +; GFX9-NEXT: v_and_b32_e32 v42, 0xffff0000, v2 +; GFX9-NEXT: v_add_f32_e32 v42, 0x40c00000, v42 +; GFX9-NEXT: v_bfe_u32 v43, v42, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v43, v43, v42 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_add_u32_e32 v43, 0x7fff, v43 +; GFX9-NEXT: v_or_b32_e32 v44, 0x400000, v42 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v42, v42 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v42, v43, v44, vcc +; GFX9-NEXT: v_bfe_u32 v43, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v43, v43, v2 +; GFX9-NEXT: v_add_u32_e32 v43, 0x7fff, v43 +; GFX9-NEXT: v_or_b32_e32 v44, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v43, v44, vcc +; GFX9-NEXT: v_and_b32_e32 v43, 0xffff0000, v3 +; GFX9-NEXT: v_add_f32_e32 v43, 0x40c00000, v43 +; GFX9-NEXT: v_bfe_u32 v44, v43, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v44, v44, v43 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_add_u32_e32 v44, 0x7fff, v44 +; GFX9-NEXT: v_or_b32_e32 v45, 0x400000, v43 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v43, v43 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v43, v44, v45, vcc +; GFX9-NEXT: v_bfe_u32 v44, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v44, v44, v3 +; GFX9-NEXT: v_add_u32_e32 v44, 0x7fff, v44 +; GFX9-NEXT: v_or_b32_e32 v45, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v44, v45, vcc +; GFX9-NEXT: v_and_b32_e32 v44, 0xffff0000, v4 +; GFX9-NEXT: v_add_f32_e32 v44, 0x40c00000, v44 +; GFX9-NEXT: v_bfe_u32 v45, v44, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v45, v45, v44 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_add_u32_e32 v45, 0x7fff, v45 +; GFX9-NEXT: v_or_b32_e32 v46, 0x400000, v44 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v44, v44 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v44, v45, v46, vcc +; GFX9-NEXT: v_bfe_u32 v45, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v45, v45, v4 +; GFX9-NEXT: v_add_u32_e32 v45, 0x7fff, v45 +; GFX9-NEXT: v_or_b32_e32 v46, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v45, v46, vcc +; GFX9-NEXT: v_and_b32_e32 v45, 0xffff0000, v5 +; GFX9-NEXT: v_add_f32_e32 v45, 0x40c00000, v45 +; GFX9-NEXT: v_bfe_u32 v46, v45, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v46, v46, v45 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_add_u32_e32 v46, 0x7fff, v46 +; GFX9-NEXT: v_or_b32_e32 v47, 0x400000, v45 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v45, v45 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v45, v46, v47, vcc +; GFX9-NEXT: v_bfe_u32 v46, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v46, v46, v5 +; GFX9-NEXT: v_add_u32_e32 v46, 0x7fff, v46 +; GFX9-NEXT: v_or_b32_e32 v47, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v46, v47, vcc +; GFX9-NEXT: v_and_b32_e32 v46, 0xffff0000, v6 +; GFX9-NEXT: v_add_f32_e32 v46, 0x40c00000, v46 +; GFX9-NEXT: v_bfe_u32 v47, v46, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v47, v47, v46 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_add_u32_e32 v47, 0x7fff, v47 +; GFX9-NEXT: v_or_b32_e32 v56, 0x400000, v46 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v46, v46 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v46, v47, v56, vcc +; GFX9-NEXT: v_bfe_u32 v47, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v47, v47, v6 +; GFX9-NEXT: v_add_u32_e32 v47, 0x7fff, v47 +; GFX9-NEXT: v_or_b32_e32 v56, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v47, v56, vcc +; GFX9-NEXT: v_and_b32_e32 v47, 0xffff0000, v7 +; GFX9-NEXT: v_add_f32_e32 v47, 0x40c00000, v47 +; GFX9-NEXT: v_bfe_u32 v56, v47, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v56, v56, v47 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_add_u32_e32 v56, 0x7fff, v56 +; GFX9-NEXT: v_or_b32_e32 v57, 0x400000, v47 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v47, v47 +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v47, v56, v57, vcc +; GFX9-NEXT: v_bfe_u32 v56, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v56, v56, v7 +; GFX9-NEXT: v_add_u32_e32 v56, 0x7fff, v56 +; GFX9-NEXT: v_or_b32_e32 v57, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v56, v57, vcc +; GFX9-NEXT: v_and_b32_e32 v56, 0xffff0000, v8 +; GFX9-NEXT: v_add_f32_e32 v56, 0x40c00000, v56 +; GFX9-NEXT: v_bfe_u32 v57, v56, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v57, v57, v56 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX9-NEXT: v_add_u32_e32 v57, 0x7fff, v57 +; GFX9-NEXT: v_or_b32_e32 v58, 0x400000, v56 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v56, v56 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v56, v57, v58, vcc +; GFX9-NEXT: v_bfe_u32 v57, v8, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v57, v57, v8 +; GFX9-NEXT: v_add_u32_e32 v57, 0x7fff, v57 +; GFX9-NEXT: v_or_b32_e32 v58, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v57, v58, vcc +; GFX9-NEXT: v_and_b32_e32 v57, 0xffff0000, v9 +; GFX9-NEXT: v_add_f32_e32 v57, 0x40c00000, v57 +; GFX9-NEXT: v_bfe_u32 v58, v57, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v58, v58, v57 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX9-NEXT: v_add_u32_e32 v58, 0x7fff, v58 +; GFX9-NEXT: v_or_b32_e32 v59, 0x400000, v57 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v57, v57 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v57, v58, v59, vcc +; GFX9-NEXT: v_bfe_u32 v58, v9, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v58, v58, v9 +; GFX9-NEXT: v_add_u32_e32 v58, 0x7fff, v58 +; GFX9-NEXT: v_or_b32_e32 v59, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v58, v59, vcc +; GFX9-NEXT: v_and_b32_e32 v58, 0xffff0000, v10 +; GFX9-NEXT: v_add_f32_e32 v58, 0x40c00000, v58 +; GFX9-NEXT: v_bfe_u32 v59, v58, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v59, v59, v58 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX9-NEXT: v_add_u32_e32 v59, 0x7fff, v59 +; GFX9-NEXT: v_or_b32_e32 v60, 0x400000, v58 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v58, v58 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v58, v59, v60, vcc +; GFX9-NEXT: v_bfe_u32 v59, v10, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v59, v59, v10 +; GFX9-NEXT: v_add_u32_e32 v59, 0x7fff, v59 +; GFX9-NEXT: v_or_b32_e32 v60, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v59, v60, vcc +; GFX9-NEXT: v_and_b32_e32 v59, 0xffff0000, v11 +; GFX9-NEXT: v_add_f32_e32 v59, 0x40c00000, v59 +; GFX9-NEXT: v_bfe_u32 v60, v59, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v60, v60, v59 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX9-NEXT: v_add_u32_e32 v60, 0x7fff, v60 +; GFX9-NEXT: v_or_b32_e32 v61, 0x400000, v59 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v59, v59 +; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v59, v60, v61, vcc +; GFX9-NEXT: v_bfe_u32 v60, v11, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v60, v60, v11 +; GFX9-NEXT: v_add_u32_e32 v60, 0x7fff, v60 +; GFX9-NEXT: v_or_b32_e32 v61, 0x400000, v11 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v60, v61, vcc +; GFX9-NEXT: v_and_b32_e32 v60, 0xffff0000, v12 +; GFX9-NEXT: v_add_f32_e32 v60, 0x40c00000, v60 +; GFX9-NEXT: v_bfe_u32 v61, v60, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v61, v61, v60 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX9-NEXT: v_add_u32_e32 v61, 0x7fff, v61 +; GFX9-NEXT: v_or_b32_e32 v62, 0x400000, v60 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v60, v60 +; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v60, v61, v62, vcc +; GFX9-NEXT: v_bfe_u32 v61, v12, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v61, v61, v12 +; GFX9-NEXT: v_add_u32_e32 v61, 0x7fff, v61 +; GFX9-NEXT: v_or_b32_e32 v62, 0x400000, v12 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v61, v62, vcc +; GFX9-NEXT: v_and_b32_e32 v61, 0xffff0000, v13 +; GFX9-NEXT: v_add_f32_e32 v61, 0x40c00000, v61 +; GFX9-NEXT: v_bfe_u32 v62, v61, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v62, v62, v61 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX9-NEXT: v_add_u32_e32 v62, 0x7fff, v62 +; GFX9-NEXT: v_mov_b32_e32 v1, v63 +; GFX9-NEXT: v_or_b32_e32 v63, 0x400000, v61 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v61, v61 +; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v61, v62, v63, vcc +; GFX9-NEXT: v_bfe_u32 v62, v13, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v62, v62, v13 +; GFX9-NEXT: v_add_u32_e32 v62, 0x7fff, v62 +; GFX9-NEXT: v_or_b32_e32 v63, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v62, v63, vcc +; GFX9-NEXT: v_and_b32_e32 v62, 0xffff0000, v14 +; GFX9-NEXT: v_add_f32_e32 v62, 0x40c00000, v62 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX9-NEXT: v_bfe_u32 v63, v62, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX9-NEXT: v_add_u32_e32 v63, v63, v62 +; GFX9-NEXT: v_or_b32_e32 v0, 0x400000, v62 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v62, v62 +; GFX9-NEXT: v_bfe_u32 v62, v14, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v63, 0x7fff, v63 +; GFX9-NEXT: v_add_u32_e32 v62, v62, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v63, v0, vcc +; GFX9-NEXT: v_add_u32_e32 v62, 0x7fff, v62 +; GFX9-NEXT: v_or_b32_e32 v63, 0x400000, v14 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v62, v63, vcc +; GFX9-NEXT: v_and_b32_e32 v62, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v62, 0x40c00000, v62 +; GFX9-NEXT: v_bfe_u32 v63, v62, 16, 1 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_add_u32_e32 v63, v63, v62 +; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v62 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v62, v62 +; GFX9-NEXT: v_lshlrev_b32_e32 v62, 16, v1 +; GFX9-NEXT: v_add_u32_e32 v63, 0x7fff, v63 +; GFX9-NEXT: v_add_f32_e32 v62, 0x40c00000, v62 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v63, v15, vcc +; GFX9-NEXT: v_bfe_u32 v63, v62, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v63, v63, v62 +; GFX9-NEXT: v_add_u32_e32 v63, 0x7fff, v63 +; GFX9-NEXT: v_or_b32_e32 v0, 0x400000, v62 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v62, v62 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v63, v0, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v62, 0xffff0000 +; GFX9-NEXT: v_and_or_b32 v63, v15, v62, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v14 +; GFX9-NEXT: v_and_or_b32 v14, v2, v62, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v13 +; GFX9-NEXT: v_and_or_b32 v13, v61, v62, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; GFX9-NEXT: v_and_or_b32 v12, v60, v62, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; GFX9-NEXT: v_and_or_b32 v11, v59, v62, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; GFX9-NEXT: v_and_or_b32 v10, v58, v62, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; GFX9-NEXT: v_and_or_b32 v9, v57, v62, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; GFX9-NEXT: v_and_or_b32 v8, v56, v62, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; GFX9-NEXT: v_and_or_b32 v7, v47, v62, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX9-NEXT: v_and_or_b32 v6, v46, v62, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX9-NEXT: v_and_or_b32 v5, v45, v62, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; GFX9-NEXT: v_and_or_b32 v4, v44, v62, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; GFX9-NEXT: v_and_or_b32 v3, v43, v62, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v31 +; GFX9-NEXT: v_and_or_b32 v31, v55, v62, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v30 +; GFX9-NEXT: v_and_or_b32 v30, v54, v62, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v29 +; GFX9-NEXT: v_and_or_b32 v29, v53, v62, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v28 +; GFX9-NEXT: v_and_or_b32 v28, v52, v62, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v27 +; GFX9-NEXT: v_and_or_b32 v27, v51, v62, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v26 +; GFX9-NEXT: v_and_or_b32 v26, v50, v62, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v25 +; GFX9-NEXT: v_and_or_b32 v25, v49, v62, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v24 +; GFX9-NEXT: v_and_or_b32 v24, v48, v62, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v23 +; GFX9-NEXT: v_and_or_b32 v23, v39, v62, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v22 +; GFX9-NEXT: v_and_or_b32 v22, v38, v62, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v21 +; GFX9-NEXT: v_and_or_b32 v21, v37, v62, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v20 +; GFX9-NEXT: v_and_or_b32 v20, v36, v62, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v19 +; GFX9-NEXT: v_and_or_b32 v19, v35, v62, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v32 +; GFX9-NEXT: v_and_or_b32 v32, v34, v62, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v33 +; GFX9-NEXT: v_and_or_b32 v2, v42, v62, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; GFX9-NEXT: v_and_or_b32 v1, v41, v62, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; GFX9-NEXT: v_and_or_b32 v0, v40, v62, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_or_b32 v17, v16, v62, v15 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_or_b32 v16, v16, v62, v15 +; GFX9-NEXT: .LBB105_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v15, v63 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v18, v32 -; GFX9-NEXT: v_readlane_b32 s31, v43, 1 -; GFX9-NEXT: v_readlane_b32 s30, v43, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB105_4: +; GFX9-NEXT: s_branch .LBB105_2 ; ; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v64i16_scalar: ; GFX11-TRUE16: ; %bb.0: @@ -237720,11 +236476,11 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v54, v0, v4 :: v_dual_add_nc_u32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v55, v0, v4, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 ; GFX11-TRUE16-NEXT: s_and_b32 s0, s13, 0xffff0000 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v1 ; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v8, 16, 1 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v6, v7, vcc_lo @@ -237752,7 +236508,7 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v7 ; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s14, 16 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v55, v1, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v54, v1, v6, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v7 ; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 @@ -237788,7 +236544,7 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v11, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v10 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v54.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v55.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v66.h ; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v4, v9 :: v_dual_add_nc_u32 v4, 0x7fff, v7 ; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 @@ -237986,7 +236742,7 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v81, v100, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v99, v99 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v83.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v55.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v54.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v15, v96, v101, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v97.h ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -240653,120 +239409,146 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; VI-NEXT: v_writelane_b32 v32, s35, 3 ; VI-NEXT: v_writelane_b32 v32, s36, 4 ; VI-NEXT: v_writelane_b32 v32, s37, 5 +; VI-NEXT: v_mov_b32_e32 v19, s16 +; VI-NEXT: v_readfirstlane_b32 s57, v2 +; VI-NEXT: v_mov_b32_e32 v2, s17 +; VI-NEXT: v_readfirstlane_b32 s56, v3 +; VI-NEXT: v_mov_b32_e32 v3, s18 +; VI-NEXT: v_readfirstlane_b32 s47, v4 +; VI-NEXT: v_mov_b32_e32 v4, s19 +; VI-NEXT: v_readfirstlane_b32 s46, v5 +; VI-NEXT: v_mov_b32_e32 v5, s20 +; VI-NEXT: v_readfirstlane_b32 s45, v6 +; VI-NEXT: v_mov_b32_e32 v6, s21 +; VI-NEXT: v_readfirstlane_b32 s44, v7 +; VI-NEXT: v_mov_b32_e32 v7, s22 +; VI-NEXT: v_readfirstlane_b32 s43, v8 +; VI-NEXT: v_mov_b32_e32 v8, s23 +; VI-NEXT: v_readfirstlane_b32 s42, v9 +; VI-NEXT: v_mov_b32_e32 v9, s24 +; VI-NEXT: v_readfirstlane_b32 s41, v10 +; VI-NEXT: v_mov_b32_e32 v10, s25 +; VI-NEXT: v_readfirstlane_b32 s40, v11 +; VI-NEXT: v_mov_b32_e32 v11, s26 +; VI-NEXT: v_readfirstlane_b32 s26, v12 +; VI-NEXT: v_mov_b32_e32 v12, s27 +; VI-NEXT: v_readfirstlane_b32 s24, v13 +; VI-NEXT: v_mov_b32_e32 v13, s28 +; VI-NEXT: v_readfirstlane_b32 s22, v14 +; VI-NEXT: v_mov_b32_e32 v14, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; VI-NEXT: v_writelane_b32 v32, s38, 6 -; VI-NEXT: v_readfirstlane_b32 s47, v2 -; VI-NEXT: v_readfirstlane_b32 s46, v3 -; VI-NEXT: v_readfirstlane_b32 s45, v4 -; VI-NEXT: v_readfirstlane_b32 s44, v5 -; VI-NEXT: v_readfirstlane_b32 s43, v6 -; VI-NEXT: v_readfirstlane_b32 s42, v7 -; VI-NEXT: v_readfirstlane_b32 s41, v8 -; VI-NEXT: v_readfirstlane_b32 s40, v9 -; VI-NEXT: v_readfirstlane_b32 s15, v10 -; VI-NEXT: v_readfirstlane_b32 s14, v11 -; VI-NEXT: v_readfirstlane_b32 s13, v12 -; VI-NEXT: v_readfirstlane_b32 s12, v13 -; VI-NEXT: v_readfirstlane_b32 s11, v14 -; VI-NEXT: v_readfirstlane_b32 s10, v15 -; VI-NEXT: v_readfirstlane_b32 s9, v16 -; VI-NEXT: v_readfirstlane_b32 s8, v17 +; VI-NEXT: v_readfirstlane_b32 s20, v15 +; VI-NEXT: v_readfirstlane_b32 s18, v16 +; VI-NEXT: v_readfirstlane_b32 s16, v17 +; VI-NEXT: v_readfirstlane_b32 s27, v19 +; VI-NEXT: v_readfirstlane_b32 s25, v2 +; VI-NEXT: v_readfirstlane_b32 s23, v3 +; VI-NEXT: v_readfirstlane_b32 s21, v4 +; VI-NEXT: v_readfirstlane_b32 s19, v5 +; VI-NEXT: v_readfirstlane_b32 s17, v6 +; VI-NEXT: v_readfirstlane_b32 s15, v7 +; VI-NEXT: v_readfirstlane_b32 s14, v8 +; VI-NEXT: v_readfirstlane_b32 s13, v9 +; VI-NEXT: v_readfirstlane_b32 s12, v10 +; VI-NEXT: v_readfirstlane_b32 s11, v11 +; VI-NEXT: v_readfirstlane_b32 s10, v12 +; VI-NEXT: v_readfirstlane_b32 s8, v13 +; VI-NEXT: v_readfirstlane_b32 s7, v14 ; VI-NEXT: v_readfirstlane_b32 s6, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_readfirstlane_b32 s7, v1 +; VI-NEXT: v_readfirstlane_b32 s9, v1 ; VI-NEXT: v_writelane_b32 v32, s39, 7 ; VI-NEXT: s_cbranch_scc0 .LBB107_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB107_3 ; VI-NEXT: .LBB107_2: ; %cmp.true -; VI-NEXT: s_and_b32 s4, s47, 0xffff0000 -; VI-NEXT: s_add_i32 s5, s47, 3 -; VI-NEXT: s_and_b32 s47, s46, 0xffff0000 +; VI-NEXT: s_and_b32 s4, s57, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s57, 3 +; VI-NEXT: s_and_b32 s28, s56, 0xffff0000 +; VI-NEXT: s_add_i32 s29, s56, 3 +; VI-NEXT: s_and_b32 s56, s47, 0xffff0000 +; VI-NEXT: s_add_i32 s47, s47, 3 +; VI-NEXT: s_and_b32 s57, s46, 0xffff0000 ; VI-NEXT: s_add_i32 s46, s46, 3 -; VI-NEXT: s_and_b32 s56, s45, 0xffff0000 +; VI-NEXT: s_and_b32 s58, s45, 0xffff0000 ; VI-NEXT: s_add_i32 s45, s45, 3 -; VI-NEXT: s_and_b32 s57, s44, 0xffff0000 +; VI-NEXT: s_and_b32 s59, s44, 0xffff0000 ; VI-NEXT: s_add_i32 s44, s44, 3 -; VI-NEXT: s_and_b32 s58, s43, 0xffff0000 +; VI-NEXT: s_and_b32 s60, s43, 0xffff0000 ; VI-NEXT: s_add_i32 s43, s43, 3 -; VI-NEXT: s_and_b32 s59, s42, 0xffff0000 +; VI-NEXT: s_and_b32 s61, s42, 0xffff0000 ; VI-NEXT: s_add_i32 s42, s42, 3 -; VI-NEXT: s_and_b32 s60, s41, 0xffff0000 +; VI-NEXT: s_and_b32 s62, s41, 0xffff0000 ; VI-NEXT: s_add_i32 s41, s41, 3 -; VI-NEXT: s_and_b32 s61, s40, 0xffff0000 +; VI-NEXT: s_and_b32 s63, s40, 0xffff0000 ; VI-NEXT: s_add_i32 s40, s40, 3 -; VI-NEXT: s_and_b32 s62, s15, 0xffff0000 +; VI-NEXT: s_and_b32 s72, s26, 0xffff0000 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_and_b32 s73, s24, 0xffff0000 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_and_b32 s74, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_and_b32 s75, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s76, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_and_b32 s77, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s78, s27, 0xffff0000 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_and_b32 s79, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_and_b32 s88, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_and_b32 s89, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_and_b32 s90, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_and_b32 s91, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_and_b32 vcc_lo, s15, 0xffff0000 ; VI-NEXT: s_add_i32 s15, s15, 3 -; VI-NEXT: s_and_b32 s63, s14, 0xffff0000 +; VI-NEXT: s_and_b32 vcc_hi, s14, 0xffff0000 ; VI-NEXT: s_add_i32 s14, s14, 3 -; VI-NEXT: s_and_b32 s72, s13, 0xffff0000 +; VI-NEXT: s_and_b32 s30, s13, 0xffff0000 ; VI-NEXT: s_add_i32 s13, s13, 3 -; VI-NEXT: s_and_b32 s73, s12, 0xffff0000 +; VI-NEXT: s_and_b32 s31, s12, 0xffff0000 ; VI-NEXT: s_add_i32 s12, s12, 3 -; VI-NEXT: s_and_b32 s74, s11, 0xffff0000 +; VI-NEXT: s_and_b32 s34, s11, 0xffff0000 ; VI-NEXT: s_add_i32 s11, s11, 3 -; VI-NEXT: s_and_b32 s75, s10, 0xffff0000 +; VI-NEXT: s_and_b32 s35, s10, 0xffff0000 ; VI-NEXT: s_add_i32 s10, s10, 3 -; VI-NEXT: s_and_b32 s76, s9, 0xffff0000 -; VI-NEXT: s_add_i32 s9, s9, 3 -; VI-NEXT: s_and_b32 s77, s8, 0xffff0000 +; VI-NEXT: s_and_b32 s36, s8, 0xffff0000 ; VI-NEXT: s_add_i32 s8, s8, 3 -; VI-NEXT: s_and_b32 s78, s16, 0xffff0000 -; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_and_b32 s79, s17, 0xffff0000 -; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_and_b32 s88, s18, 0xffff0000 -; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_and_b32 s89, s19, 0xffff0000 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_and_b32 s90, s20, 0xffff0000 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_and_b32 s91, s21, 0xffff0000 -; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_and_b32 vcc_lo, s22, 0xffff0000 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_and_b32 vcc_hi, s23, 0xffff0000 -; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: s_and_b32 s30, s24, 0xffff0000 -; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_and_b32 s31, s25, 0xffff0000 -; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_and_b32 s34, s26, 0xffff0000 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_and_b32 s35, s27, 0xffff0000 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_and_b32 s36, s28, 0xffff0000 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_and_b32 s37, s29, 0xffff0000 -; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_and_b32 s37, s7, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s7, 3 ; VI-NEXT: s_and_b32 s38, s6, 0xffff0000 ; VI-NEXT: s_add_i32 s6, s6, 3 -; VI-NEXT: s_and_b32 s39, s7, 0xffff0000 -; VI-NEXT: s_add_i32 s7, s7, 3 -; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s39, s9, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_and_b32 s9, s9, 0xffff ; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_and_b32 s29, s29, 0xffff -; VI-NEXT: s_and_b32 s28, s28, 0xffff -; VI-NEXT: s_and_b32 s27, s27, 0xffff -; VI-NEXT: s_and_b32 s26, s26, 0xffff -; VI-NEXT: s_and_b32 s25, s25, 0xffff -; VI-NEXT: s_and_b32 s24, s24, 0xffff -; VI-NEXT: s_and_b32 s23, s23, 0xffff -; VI-NEXT: s_and_b32 s22, s22, 0xffff -; VI-NEXT: s_and_b32 s21, s21, 0xffff -; VI-NEXT: s_and_b32 s20, s20, 0xffff -; VI-NEXT: s_and_b32 s19, s19, 0xffff -; VI-NEXT: s_and_b32 s18, s18, 0xffff -; VI-NEXT: s_and_b32 s17, s17, 0xffff -; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: s_and_b32 s8, s8, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff ; VI-NEXT: s_and_b32 s10, s10, 0xffff ; VI-NEXT: s_and_b32 s11, s11, 0xffff ; VI-NEXT: s_and_b32 s12, s12, 0xffff ; VI-NEXT: s_and_b32 s13, s13, 0xffff ; VI-NEXT: s_and_b32 s14, s14, 0xffff ; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: s_and_b32 s25, s25, 0xffff +; VI-NEXT: s_and_b32 s27, s27, 0xffff +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_and_b32 s18, s18, 0xffff +; VI-NEXT: s_and_b32 s20, s20, 0xffff +; VI-NEXT: s_and_b32 s22, s22, 0xffff +; VI-NEXT: s_and_b32 s24, s24, 0xffff +; VI-NEXT: s_and_b32 s26, s26, 0xffff ; VI-NEXT: s_and_b32 s40, s40, 0xffff ; VI-NEXT: s_and_b32 s41, s41, 0xffff ; VI-NEXT: s_and_b32 s42, s42, 0xffff @@ -240774,63 +239556,63 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; VI-NEXT: s_and_b32 s44, s44, 0xffff ; VI-NEXT: s_and_b32 s45, s45, 0xffff ; VI-NEXT: s_and_b32 s46, s46, 0xffff +; VI-NEXT: s_and_b32 s47, s47, 0xffff +; VI-NEXT: s_and_b32 s29, s29, 0xffff ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s7, s39, s7 +; VI-NEXT: s_or_b32 s9, s39, s9 ; VI-NEXT: s_or_b32 s6, s38, s6 -; VI-NEXT: s_or_b32 s29, s37, s29 -; VI-NEXT: s_or_b32 s28, s36, s28 -; VI-NEXT: s_or_b32 s27, s35, s27 -; VI-NEXT: s_or_b32 s26, s34, s26 -; VI-NEXT: s_or_b32 s25, s31, s25 -; VI-NEXT: s_or_b32 s24, s30, s24 -; VI-NEXT: s_or_b32 s23, vcc_hi, s23 -; VI-NEXT: s_or_b32 s22, vcc_lo, s22 -; VI-NEXT: s_or_b32 s21, s91, s21 -; VI-NEXT: s_or_b32 s20, s90, s20 -; VI-NEXT: s_or_b32 s19, s89, s19 -; VI-NEXT: s_or_b32 s18, s88, s18 -; VI-NEXT: s_or_b32 s17, s79, s17 -; VI-NEXT: s_or_b32 s16, s78, s16 -; VI-NEXT: s_or_b32 s8, s77, s8 -; VI-NEXT: s_or_b32 s9, s76, s9 -; VI-NEXT: s_or_b32 s10, s75, s10 -; VI-NEXT: s_or_b32 s11, s74, s11 -; VI-NEXT: s_or_b32 s12, s73, s12 -; VI-NEXT: s_or_b32 s13, s72, s13 -; VI-NEXT: s_or_b32 s14, s63, s14 -; VI-NEXT: s_or_b32 s15, s62, s15 -; VI-NEXT: s_or_b32 s40, s61, s40 -; VI-NEXT: s_or_b32 s41, s60, s41 -; VI-NEXT: s_or_b32 s42, s59, s42 -; VI-NEXT: s_or_b32 s43, s58, s43 -; VI-NEXT: s_or_b32 s44, s57, s44 -; VI-NEXT: s_or_b32 s45, s56, s45 -; VI-NEXT: s_or_b32 s46, s47, s46 +; VI-NEXT: s_or_b32 s7, s37, s7 +; VI-NEXT: s_or_b32 s8, s36, s8 +; VI-NEXT: s_or_b32 s10, s35, s10 +; VI-NEXT: s_or_b32 s11, s34, s11 +; VI-NEXT: s_or_b32 s12, s31, s12 +; VI-NEXT: s_or_b32 s13, s30, s13 +; VI-NEXT: s_or_b32 s14, vcc_hi, s14 +; VI-NEXT: s_or_b32 s15, vcc_lo, s15 +; VI-NEXT: s_or_b32 s17, s91, s17 +; VI-NEXT: s_or_b32 s19, s90, s19 +; VI-NEXT: s_or_b32 s21, s89, s21 +; VI-NEXT: s_or_b32 s23, s88, s23 +; VI-NEXT: s_or_b32 s25, s79, s25 +; VI-NEXT: s_or_b32 s27, s78, s27 +; VI-NEXT: s_or_b32 s16, s77, s16 +; VI-NEXT: s_or_b32 s18, s76, s18 +; VI-NEXT: s_or_b32 s20, s75, s20 +; VI-NEXT: s_or_b32 s22, s74, s22 +; VI-NEXT: s_or_b32 s24, s73, s24 +; VI-NEXT: s_or_b32 s26, s72, s26 +; VI-NEXT: s_or_b32 s40, s63, s40 +; VI-NEXT: s_or_b32 s41, s62, s41 +; VI-NEXT: s_or_b32 s42, s61, s42 +; VI-NEXT: s_or_b32 s43, s60, s43 +; VI-NEXT: s_or_b32 s44, s59, s44 +; VI-NEXT: s_or_b32 s45, s58, s45 +; VI-NEXT: s_or_b32 s46, s57, s46 +; VI-NEXT: s_or_b32 s47, s56, s47 +; VI-NEXT: s_or_b32 s28, s28, s29 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 ; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: s_add_i32 s29, s29, 0x30000 -; VI-NEXT: s_add_i32 s28, s28, 0x30000 -; VI-NEXT: s_add_i32 s27, s27, 0x30000 -; VI-NEXT: s_add_i32 s26, s26, 0x30000 -; VI-NEXT: s_add_i32 s25, s25, 0x30000 -; VI-NEXT: s_add_i32 s24, s24, 0x30000 -; VI-NEXT: s_add_i32 s23, s23, 0x30000 -; VI-NEXT: s_add_i32 s22, s22, 0x30000 -; VI-NEXT: s_add_i32 s21, s21, 0x30000 -; VI-NEXT: s_add_i32 s20, s20, 0x30000 -; VI-NEXT: s_add_i32 s19, s19, 0x30000 -; VI-NEXT: s_add_i32 s18, s18, 0x30000 -; VI-NEXT: s_add_i32 s17, s17, 0x30000 -; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 ; VI-NEXT: s_add_i32 s8, s8, 0x30000 -; VI-NEXT: s_add_i32 s9, s9, 0x30000 ; VI-NEXT: s_add_i32 s10, s10, 0x30000 ; VI-NEXT: s_add_i32 s11, s11, 0x30000 ; VI-NEXT: s_add_i32 s12, s12, 0x30000 ; VI-NEXT: s_add_i32 s13, s13, 0x30000 ; VI-NEXT: s_add_i32 s14, s14, 0x30000 ; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s19, s19, 0x30000 +; VI-NEXT: s_add_i32 s21, s21, 0x30000 +; VI-NEXT: s_add_i32 s23, s23, 0x30000 +; VI-NEXT: s_add_i32 s25, s25, 0x30000 +; VI-NEXT: s_add_i32 s27, s27, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s18, s18, 0x30000 +; VI-NEXT: s_add_i32 s20, s20, 0x30000 +; VI-NEXT: s_add_i32 s22, s22, 0x30000 +; VI-NEXT: s_add_i32 s24, s24, 0x30000 +; VI-NEXT: s_add_i32 s26, s26, 0x30000 ; VI-NEXT: s_add_i32 s40, s40, 0x30000 ; VI-NEXT: s_add_i32 s41, s41, 0x30000 ; VI-NEXT: s_add_i32 s42, s42, 0x30000 @@ -240838,40 +239620,42 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; VI-NEXT: s_add_i32 s44, s44, 0x30000 ; VI-NEXT: s_add_i32 s45, s45, 0x30000 ; VI-NEXT: s_add_i32 s46, s46, 0x30000 -; VI-NEXT: s_add_i32 s47, s4, 0x30000 +; VI-NEXT: s_add_i32 s47, s47, 0x30000 +; VI-NEXT: s_add_i32 s56, s28, 0x30000 +; VI-NEXT: s_add_i32 s57, s4, 0x30000 ; VI-NEXT: .LBB107_3: ; %end -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 -; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 -; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v0, s27 +; VI-NEXT: v_mov_b32_e32 v1, s25 +; VI-NEXT: v_mov_b32_e32 v2, s23 +; VI-NEXT: v_mov_b32_e32 v3, s21 +; VI-NEXT: v_mov_b32_e32 v4, s19 +; VI-NEXT: v_mov_b32_e32 v5, s17 +; VI-NEXT: v_mov_b32_e32 v6, s15 +; VI-NEXT: v_mov_b32_e32 v7, s14 +; VI-NEXT: v_mov_b32_e32 v8, s13 +; VI-NEXT: v_mov_b32_e32 v9, s12 +; VI-NEXT: v_mov_b32_e32 v10, s11 +; VI-NEXT: v_mov_b32_e32 v11, s10 +; VI-NEXT: v_mov_b32_e32 v12, s8 +; VI-NEXT: v_mov_b32_e32 v13, s7 ; VI-NEXT: v_mov_b32_e32 v14, s6 -; VI-NEXT: v_mov_b32_e32 v15, s7 -; VI-NEXT: v_mov_b32_e32 v16, s47 -; VI-NEXT: v_mov_b32_e32 v17, s46 -; VI-NEXT: v_mov_b32_e32 v18, s45 -; VI-NEXT: v_mov_b32_e32 v19, s44 -; VI-NEXT: v_mov_b32_e32 v20, s43 -; VI-NEXT: v_mov_b32_e32 v21, s42 -; VI-NEXT: v_mov_b32_e32 v22, s41 -; VI-NEXT: v_mov_b32_e32 v23, s40 -; VI-NEXT: v_mov_b32_e32 v24, s15 -; VI-NEXT: v_mov_b32_e32 v25, s14 -; VI-NEXT: v_mov_b32_e32 v26, s13 -; VI-NEXT: v_mov_b32_e32 v27, s12 -; VI-NEXT: v_mov_b32_e32 v28, s11 -; VI-NEXT: v_mov_b32_e32 v29, s10 -; VI-NEXT: v_mov_b32_e32 v30, s9 -; VI-NEXT: v_mov_b32_e32 v31, s8 +; VI-NEXT: v_mov_b32_e32 v15, s9 +; VI-NEXT: v_mov_b32_e32 v16, s57 +; VI-NEXT: v_mov_b32_e32 v17, s56 +; VI-NEXT: v_mov_b32_e32 v18, s47 +; VI-NEXT: v_mov_b32_e32 v19, s46 +; VI-NEXT: v_mov_b32_e32 v20, s45 +; VI-NEXT: v_mov_b32_e32 v21, s44 +; VI-NEXT: v_mov_b32_e32 v22, s43 +; VI-NEXT: v_mov_b32_e32 v23, s42 +; VI-NEXT: v_mov_b32_e32 v24, s41 +; VI-NEXT: v_mov_b32_e32 v25, s40 +; VI-NEXT: v_mov_b32_e32 v26, s26 +; VI-NEXT: v_mov_b32_e32 v27, s24 +; VI-NEXT: v_mov_b32_e32 v28, s22 +; VI-NEXT: v_mov_b32_e32 v29, s20 +; VI-NEXT: v_mov_b32_e32 v30, s18 +; VI-NEXT: v_mov_b32_e32 v31, s16 ; VI-NEXT: v_readlane_b32 s39, v32, 7 ; VI-NEXT: v_readlane_b32 s38, v32, 6 ; VI-NEXT: v_readlane_b32 s37, v32, 5 @@ -245288,120 +244072,146 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; VI-NEXT: v_writelane_b32 v32, s35, 3 ; VI-NEXT: v_writelane_b32 v32, s36, 4 ; VI-NEXT: v_writelane_b32 v32, s37, 5 +; VI-NEXT: v_mov_b32_e32 v19, s16 +; VI-NEXT: v_readfirstlane_b32 s57, v2 +; VI-NEXT: v_mov_b32_e32 v2, s17 +; VI-NEXT: v_readfirstlane_b32 s56, v3 +; VI-NEXT: v_mov_b32_e32 v3, s18 +; VI-NEXT: v_readfirstlane_b32 s47, v4 +; VI-NEXT: v_mov_b32_e32 v4, s19 +; VI-NEXT: v_readfirstlane_b32 s46, v5 +; VI-NEXT: v_mov_b32_e32 v5, s20 +; VI-NEXT: v_readfirstlane_b32 s45, v6 +; VI-NEXT: v_mov_b32_e32 v6, s21 +; VI-NEXT: v_readfirstlane_b32 s44, v7 +; VI-NEXT: v_mov_b32_e32 v7, s22 +; VI-NEXT: v_readfirstlane_b32 s43, v8 +; VI-NEXT: v_mov_b32_e32 v8, s23 +; VI-NEXT: v_readfirstlane_b32 s42, v9 +; VI-NEXT: v_mov_b32_e32 v9, s24 +; VI-NEXT: v_readfirstlane_b32 s41, v10 +; VI-NEXT: v_mov_b32_e32 v10, s25 +; VI-NEXT: v_readfirstlane_b32 s40, v11 +; VI-NEXT: v_mov_b32_e32 v11, s26 +; VI-NEXT: v_readfirstlane_b32 s26, v12 +; VI-NEXT: v_mov_b32_e32 v12, s27 +; VI-NEXT: v_readfirstlane_b32 s24, v13 +; VI-NEXT: v_mov_b32_e32 v13, s28 +; VI-NEXT: v_readfirstlane_b32 s22, v14 +; VI-NEXT: v_mov_b32_e32 v14, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; VI-NEXT: v_writelane_b32 v32, s38, 6 -; VI-NEXT: v_readfirstlane_b32 s47, v2 -; VI-NEXT: v_readfirstlane_b32 s46, v3 -; VI-NEXT: v_readfirstlane_b32 s45, v4 -; VI-NEXT: v_readfirstlane_b32 s44, v5 -; VI-NEXT: v_readfirstlane_b32 s43, v6 -; VI-NEXT: v_readfirstlane_b32 s42, v7 -; VI-NEXT: v_readfirstlane_b32 s41, v8 -; VI-NEXT: v_readfirstlane_b32 s40, v9 -; VI-NEXT: v_readfirstlane_b32 s15, v10 -; VI-NEXT: v_readfirstlane_b32 s14, v11 -; VI-NEXT: v_readfirstlane_b32 s13, v12 -; VI-NEXT: v_readfirstlane_b32 s12, v13 -; VI-NEXT: v_readfirstlane_b32 s11, v14 -; VI-NEXT: v_readfirstlane_b32 s10, v15 -; VI-NEXT: v_readfirstlane_b32 s9, v16 -; VI-NEXT: v_readfirstlane_b32 s8, v17 +; VI-NEXT: v_readfirstlane_b32 s20, v15 +; VI-NEXT: v_readfirstlane_b32 s18, v16 +; VI-NEXT: v_readfirstlane_b32 s16, v17 +; VI-NEXT: v_readfirstlane_b32 s27, v19 +; VI-NEXT: v_readfirstlane_b32 s25, v2 +; VI-NEXT: v_readfirstlane_b32 s23, v3 +; VI-NEXT: v_readfirstlane_b32 s21, v4 +; VI-NEXT: v_readfirstlane_b32 s19, v5 +; VI-NEXT: v_readfirstlane_b32 s17, v6 +; VI-NEXT: v_readfirstlane_b32 s15, v7 +; VI-NEXT: v_readfirstlane_b32 s14, v8 +; VI-NEXT: v_readfirstlane_b32 s13, v9 +; VI-NEXT: v_readfirstlane_b32 s12, v10 +; VI-NEXT: v_readfirstlane_b32 s11, v11 +; VI-NEXT: v_readfirstlane_b32 s10, v12 +; VI-NEXT: v_readfirstlane_b32 s8, v13 +; VI-NEXT: v_readfirstlane_b32 s7, v14 ; VI-NEXT: v_readfirstlane_b32 s6, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_readfirstlane_b32 s7, v1 +; VI-NEXT: v_readfirstlane_b32 s9, v1 ; VI-NEXT: v_writelane_b32 v32, s39, 7 ; VI-NEXT: s_cbranch_scc0 .LBB111_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB111_3 ; VI-NEXT: .LBB111_2: ; %cmp.true -; VI-NEXT: s_and_b32 s4, s47, 0xffff0000 -; VI-NEXT: s_add_i32 s5, s47, 3 -; VI-NEXT: s_and_b32 s47, s46, 0xffff0000 +; VI-NEXT: s_and_b32 s4, s57, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s57, 3 +; VI-NEXT: s_and_b32 s28, s56, 0xffff0000 +; VI-NEXT: s_add_i32 s29, s56, 3 +; VI-NEXT: s_and_b32 s56, s47, 0xffff0000 +; VI-NEXT: s_add_i32 s47, s47, 3 +; VI-NEXT: s_and_b32 s57, s46, 0xffff0000 ; VI-NEXT: s_add_i32 s46, s46, 3 -; VI-NEXT: s_and_b32 s56, s45, 0xffff0000 +; VI-NEXT: s_and_b32 s58, s45, 0xffff0000 ; VI-NEXT: s_add_i32 s45, s45, 3 -; VI-NEXT: s_and_b32 s57, s44, 0xffff0000 +; VI-NEXT: s_and_b32 s59, s44, 0xffff0000 ; VI-NEXT: s_add_i32 s44, s44, 3 -; VI-NEXT: s_and_b32 s58, s43, 0xffff0000 +; VI-NEXT: s_and_b32 s60, s43, 0xffff0000 ; VI-NEXT: s_add_i32 s43, s43, 3 -; VI-NEXT: s_and_b32 s59, s42, 0xffff0000 +; VI-NEXT: s_and_b32 s61, s42, 0xffff0000 ; VI-NEXT: s_add_i32 s42, s42, 3 -; VI-NEXT: s_and_b32 s60, s41, 0xffff0000 +; VI-NEXT: s_and_b32 s62, s41, 0xffff0000 ; VI-NEXT: s_add_i32 s41, s41, 3 -; VI-NEXT: s_and_b32 s61, s40, 0xffff0000 +; VI-NEXT: s_and_b32 s63, s40, 0xffff0000 ; VI-NEXT: s_add_i32 s40, s40, 3 -; VI-NEXT: s_and_b32 s62, s15, 0xffff0000 +; VI-NEXT: s_and_b32 s72, s26, 0xffff0000 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_and_b32 s73, s24, 0xffff0000 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_and_b32 s74, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_and_b32 s75, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s76, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_and_b32 s77, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s78, s27, 0xffff0000 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_and_b32 s79, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_and_b32 s88, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_and_b32 s89, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_and_b32 s90, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_and_b32 s91, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_and_b32 vcc_lo, s15, 0xffff0000 ; VI-NEXT: s_add_i32 s15, s15, 3 -; VI-NEXT: s_and_b32 s63, s14, 0xffff0000 +; VI-NEXT: s_and_b32 vcc_hi, s14, 0xffff0000 ; VI-NEXT: s_add_i32 s14, s14, 3 -; VI-NEXT: s_and_b32 s72, s13, 0xffff0000 +; VI-NEXT: s_and_b32 s30, s13, 0xffff0000 ; VI-NEXT: s_add_i32 s13, s13, 3 -; VI-NEXT: s_and_b32 s73, s12, 0xffff0000 +; VI-NEXT: s_and_b32 s31, s12, 0xffff0000 ; VI-NEXT: s_add_i32 s12, s12, 3 -; VI-NEXT: s_and_b32 s74, s11, 0xffff0000 +; VI-NEXT: s_and_b32 s34, s11, 0xffff0000 ; VI-NEXT: s_add_i32 s11, s11, 3 -; VI-NEXT: s_and_b32 s75, s10, 0xffff0000 +; VI-NEXT: s_and_b32 s35, s10, 0xffff0000 ; VI-NEXT: s_add_i32 s10, s10, 3 -; VI-NEXT: s_and_b32 s76, s9, 0xffff0000 -; VI-NEXT: s_add_i32 s9, s9, 3 -; VI-NEXT: s_and_b32 s77, s8, 0xffff0000 +; VI-NEXT: s_and_b32 s36, s8, 0xffff0000 ; VI-NEXT: s_add_i32 s8, s8, 3 -; VI-NEXT: s_and_b32 s78, s16, 0xffff0000 -; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_and_b32 s79, s17, 0xffff0000 -; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_and_b32 s88, s18, 0xffff0000 -; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_and_b32 s89, s19, 0xffff0000 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_and_b32 s90, s20, 0xffff0000 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_and_b32 s91, s21, 0xffff0000 -; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_and_b32 vcc_lo, s22, 0xffff0000 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_and_b32 vcc_hi, s23, 0xffff0000 -; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: s_and_b32 s30, s24, 0xffff0000 -; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_and_b32 s31, s25, 0xffff0000 -; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_and_b32 s34, s26, 0xffff0000 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_and_b32 s35, s27, 0xffff0000 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_and_b32 s36, s28, 0xffff0000 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_and_b32 s37, s29, 0xffff0000 -; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_and_b32 s37, s7, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s7, 3 ; VI-NEXT: s_and_b32 s38, s6, 0xffff0000 ; VI-NEXT: s_add_i32 s6, s6, 3 -; VI-NEXT: s_and_b32 s39, s7, 0xffff0000 -; VI-NEXT: s_add_i32 s7, s7, 3 -; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s39, s9, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_and_b32 s9, s9, 0xffff ; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_and_b32 s29, s29, 0xffff -; VI-NEXT: s_and_b32 s28, s28, 0xffff -; VI-NEXT: s_and_b32 s27, s27, 0xffff -; VI-NEXT: s_and_b32 s26, s26, 0xffff -; VI-NEXT: s_and_b32 s25, s25, 0xffff -; VI-NEXT: s_and_b32 s24, s24, 0xffff -; VI-NEXT: s_and_b32 s23, s23, 0xffff -; VI-NEXT: s_and_b32 s22, s22, 0xffff -; VI-NEXT: s_and_b32 s21, s21, 0xffff -; VI-NEXT: s_and_b32 s20, s20, 0xffff -; VI-NEXT: s_and_b32 s19, s19, 0xffff -; VI-NEXT: s_and_b32 s18, s18, 0xffff -; VI-NEXT: s_and_b32 s17, s17, 0xffff -; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: s_and_b32 s8, s8, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff ; VI-NEXT: s_and_b32 s10, s10, 0xffff ; VI-NEXT: s_and_b32 s11, s11, 0xffff ; VI-NEXT: s_and_b32 s12, s12, 0xffff ; VI-NEXT: s_and_b32 s13, s13, 0xffff ; VI-NEXT: s_and_b32 s14, s14, 0xffff ; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: s_and_b32 s25, s25, 0xffff +; VI-NEXT: s_and_b32 s27, s27, 0xffff +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_and_b32 s18, s18, 0xffff +; VI-NEXT: s_and_b32 s20, s20, 0xffff +; VI-NEXT: s_and_b32 s22, s22, 0xffff +; VI-NEXT: s_and_b32 s24, s24, 0xffff +; VI-NEXT: s_and_b32 s26, s26, 0xffff ; VI-NEXT: s_and_b32 s40, s40, 0xffff ; VI-NEXT: s_and_b32 s41, s41, 0xffff ; VI-NEXT: s_and_b32 s42, s42, 0xffff @@ -245409,63 +244219,63 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; VI-NEXT: s_and_b32 s44, s44, 0xffff ; VI-NEXT: s_and_b32 s45, s45, 0xffff ; VI-NEXT: s_and_b32 s46, s46, 0xffff +; VI-NEXT: s_and_b32 s47, s47, 0xffff +; VI-NEXT: s_and_b32 s29, s29, 0xffff ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s7, s39, s7 +; VI-NEXT: s_or_b32 s9, s39, s9 ; VI-NEXT: s_or_b32 s6, s38, s6 -; VI-NEXT: s_or_b32 s29, s37, s29 -; VI-NEXT: s_or_b32 s28, s36, s28 -; VI-NEXT: s_or_b32 s27, s35, s27 -; VI-NEXT: s_or_b32 s26, s34, s26 -; VI-NEXT: s_or_b32 s25, s31, s25 -; VI-NEXT: s_or_b32 s24, s30, s24 -; VI-NEXT: s_or_b32 s23, vcc_hi, s23 -; VI-NEXT: s_or_b32 s22, vcc_lo, s22 -; VI-NEXT: s_or_b32 s21, s91, s21 -; VI-NEXT: s_or_b32 s20, s90, s20 -; VI-NEXT: s_or_b32 s19, s89, s19 -; VI-NEXT: s_or_b32 s18, s88, s18 -; VI-NEXT: s_or_b32 s17, s79, s17 -; VI-NEXT: s_or_b32 s16, s78, s16 -; VI-NEXT: s_or_b32 s8, s77, s8 -; VI-NEXT: s_or_b32 s9, s76, s9 -; VI-NEXT: s_or_b32 s10, s75, s10 -; VI-NEXT: s_or_b32 s11, s74, s11 -; VI-NEXT: s_or_b32 s12, s73, s12 -; VI-NEXT: s_or_b32 s13, s72, s13 -; VI-NEXT: s_or_b32 s14, s63, s14 -; VI-NEXT: s_or_b32 s15, s62, s15 -; VI-NEXT: s_or_b32 s40, s61, s40 -; VI-NEXT: s_or_b32 s41, s60, s41 -; VI-NEXT: s_or_b32 s42, s59, s42 -; VI-NEXT: s_or_b32 s43, s58, s43 -; VI-NEXT: s_or_b32 s44, s57, s44 -; VI-NEXT: s_or_b32 s45, s56, s45 -; VI-NEXT: s_or_b32 s46, s47, s46 +; VI-NEXT: s_or_b32 s7, s37, s7 +; VI-NEXT: s_or_b32 s8, s36, s8 +; VI-NEXT: s_or_b32 s10, s35, s10 +; VI-NEXT: s_or_b32 s11, s34, s11 +; VI-NEXT: s_or_b32 s12, s31, s12 +; VI-NEXT: s_or_b32 s13, s30, s13 +; VI-NEXT: s_or_b32 s14, vcc_hi, s14 +; VI-NEXT: s_or_b32 s15, vcc_lo, s15 +; VI-NEXT: s_or_b32 s17, s91, s17 +; VI-NEXT: s_or_b32 s19, s90, s19 +; VI-NEXT: s_or_b32 s21, s89, s21 +; VI-NEXT: s_or_b32 s23, s88, s23 +; VI-NEXT: s_or_b32 s25, s79, s25 +; VI-NEXT: s_or_b32 s27, s78, s27 +; VI-NEXT: s_or_b32 s16, s77, s16 +; VI-NEXT: s_or_b32 s18, s76, s18 +; VI-NEXT: s_or_b32 s20, s75, s20 +; VI-NEXT: s_or_b32 s22, s74, s22 +; VI-NEXT: s_or_b32 s24, s73, s24 +; VI-NEXT: s_or_b32 s26, s72, s26 +; VI-NEXT: s_or_b32 s40, s63, s40 +; VI-NEXT: s_or_b32 s41, s62, s41 +; VI-NEXT: s_or_b32 s42, s61, s42 +; VI-NEXT: s_or_b32 s43, s60, s43 +; VI-NEXT: s_or_b32 s44, s59, s44 +; VI-NEXT: s_or_b32 s45, s58, s45 +; VI-NEXT: s_or_b32 s46, s57, s46 +; VI-NEXT: s_or_b32 s47, s56, s47 +; VI-NEXT: s_or_b32 s28, s28, s29 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 ; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: s_add_i32 s29, s29, 0x30000 -; VI-NEXT: s_add_i32 s28, s28, 0x30000 -; VI-NEXT: s_add_i32 s27, s27, 0x30000 -; VI-NEXT: s_add_i32 s26, s26, 0x30000 -; VI-NEXT: s_add_i32 s25, s25, 0x30000 -; VI-NEXT: s_add_i32 s24, s24, 0x30000 -; VI-NEXT: s_add_i32 s23, s23, 0x30000 -; VI-NEXT: s_add_i32 s22, s22, 0x30000 -; VI-NEXT: s_add_i32 s21, s21, 0x30000 -; VI-NEXT: s_add_i32 s20, s20, 0x30000 -; VI-NEXT: s_add_i32 s19, s19, 0x30000 -; VI-NEXT: s_add_i32 s18, s18, 0x30000 -; VI-NEXT: s_add_i32 s17, s17, 0x30000 -; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 ; VI-NEXT: s_add_i32 s8, s8, 0x30000 -; VI-NEXT: s_add_i32 s9, s9, 0x30000 ; VI-NEXT: s_add_i32 s10, s10, 0x30000 ; VI-NEXT: s_add_i32 s11, s11, 0x30000 ; VI-NEXT: s_add_i32 s12, s12, 0x30000 ; VI-NEXT: s_add_i32 s13, s13, 0x30000 ; VI-NEXT: s_add_i32 s14, s14, 0x30000 ; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s19, s19, 0x30000 +; VI-NEXT: s_add_i32 s21, s21, 0x30000 +; VI-NEXT: s_add_i32 s23, s23, 0x30000 +; VI-NEXT: s_add_i32 s25, s25, 0x30000 +; VI-NEXT: s_add_i32 s27, s27, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s18, s18, 0x30000 +; VI-NEXT: s_add_i32 s20, s20, 0x30000 +; VI-NEXT: s_add_i32 s22, s22, 0x30000 +; VI-NEXT: s_add_i32 s24, s24, 0x30000 +; VI-NEXT: s_add_i32 s26, s26, 0x30000 ; VI-NEXT: s_add_i32 s40, s40, 0x30000 ; VI-NEXT: s_add_i32 s41, s41, 0x30000 ; VI-NEXT: s_add_i32 s42, s42, 0x30000 @@ -245473,40 +244283,42 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; VI-NEXT: s_add_i32 s44, s44, 0x30000 ; VI-NEXT: s_add_i32 s45, s45, 0x30000 ; VI-NEXT: s_add_i32 s46, s46, 0x30000 -; VI-NEXT: s_add_i32 s47, s4, 0x30000 +; VI-NEXT: s_add_i32 s47, s47, 0x30000 +; VI-NEXT: s_add_i32 s56, s28, 0x30000 +; VI-NEXT: s_add_i32 s57, s4, 0x30000 ; VI-NEXT: .LBB111_3: ; %end -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 -; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 -; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v0, s27 +; VI-NEXT: v_mov_b32_e32 v1, s25 +; VI-NEXT: v_mov_b32_e32 v2, s23 +; VI-NEXT: v_mov_b32_e32 v3, s21 +; VI-NEXT: v_mov_b32_e32 v4, s19 +; VI-NEXT: v_mov_b32_e32 v5, s17 +; VI-NEXT: v_mov_b32_e32 v6, s15 +; VI-NEXT: v_mov_b32_e32 v7, s14 +; VI-NEXT: v_mov_b32_e32 v8, s13 +; VI-NEXT: v_mov_b32_e32 v9, s12 +; VI-NEXT: v_mov_b32_e32 v10, s11 +; VI-NEXT: v_mov_b32_e32 v11, s10 +; VI-NEXT: v_mov_b32_e32 v12, s8 +; VI-NEXT: v_mov_b32_e32 v13, s7 ; VI-NEXT: v_mov_b32_e32 v14, s6 -; VI-NEXT: v_mov_b32_e32 v15, s7 -; VI-NEXT: v_mov_b32_e32 v16, s47 -; VI-NEXT: v_mov_b32_e32 v17, s46 -; VI-NEXT: v_mov_b32_e32 v18, s45 -; VI-NEXT: v_mov_b32_e32 v19, s44 -; VI-NEXT: v_mov_b32_e32 v20, s43 -; VI-NEXT: v_mov_b32_e32 v21, s42 -; VI-NEXT: v_mov_b32_e32 v22, s41 -; VI-NEXT: v_mov_b32_e32 v23, s40 -; VI-NEXT: v_mov_b32_e32 v24, s15 -; VI-NEXT: v_mov_b32_e32 v25, s14 -; VI-NEXT: v_mov_b32_e32 v26, s13 -; VI-NEXT: v_mov_b32_e32 v27, s12 -; VI-NEXT: v_mov_b32_e32 v28, s11 -; VI-NEXT: v_mov_b32_e32 v29, s10 -; VI-NEXT: v_mov_b32_e32 v30, s9 -; VI-NEXT: v_mov_b32_e32 v31, s8 +; VI-NEXT: v_mov_b32_e32 v15, s9 +; VI-NEXT: v_mov_b32_e32 v16, s57 +; VI-NEXT: v_mov_b32_e32 v17, s56 +; VI-NEXT: v_mov_b32_e32 v18, s47 +; VI-NEXT: v_mov_b32_e32 v19, s46 +; VI-NEXT: v_mov_b32_e32 v20, s45 +; VI-NEXT: v_mov_b32_e32 v21, s44 +; VI-NEXT: v_mov_b32_e32 v22, s43 +; VI-NEXT: v_mov_b32_e32 v23, s42 +; VI-NEXT: v_mov_b32_e32 v24, s41 +; VI-NEXT: v_mov_b32_e32 v25, s40 +; VI-NEXT: v_mov_b32_e32 v26, s26 +; VI-NEXT: v_mov_b32_e32 v27, s24 +; VI-NEXT: v_mov_b32_e32 v28, s22 +; VI-NEXT: v_mov_b32_e32 v29, s20 +; VI-NEXT: v_mov_b32_e32 v30, s18 +; VI-NEXT: v_mov_b32_e32 v31, s16 ; VI-NEXT: v_readlane_b32 s39, v32, 7 ; VI-NEXT: v_readlane_b32 s38, v32, 6 ; VI-NEXT: v_readlane_b32 s37, v32, 5 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll index 22dd3a0438136..231460f584a2e 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll @@ -24269,13 +24269,13 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 @@ -24285,22 +24285,22 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[2:3] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v3 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v3 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 8, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[0:1] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[2:3] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[22:23], 24, v[0:1] ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v0.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v0.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v1.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v1.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v2.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v2.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v3.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3 ; GFX11-TRUE16-NEXT: .LBB108_2: ; %Flow @@ -24372,32 +24372,32 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v8, v13, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[22:23], 24, v[16:17] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 8, v16 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v22, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v21, v7, v9, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v12.h ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v17 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v21, v1, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v8.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v1, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v8.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[21:22] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[16:17] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[20:21] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v20 ; GFX11-TRUE16-NEXT: .LBB108_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v10.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v16.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v22.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v17.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v21.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v20.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v18.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v22.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v21.h ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v8bf16_to_v16i8: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll index ee209f84efe7c..a8c54e8655882 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll @@ -424,7 +424,7 @@ define <10 x i16> @bitcast_v5i32_to_v10i16(<5 x i32> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB4_3: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v9, s4, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v0, v8, 16 ; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 ; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 @@ -439,7 +439,7 @@ define <10 x i16> @bitcast_v5i32_to_v10i16(<5 x i32> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 ; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_alignbit_b32 v9, s4, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v0, v8, 16 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] @@ -1634,7 +1634,7 @@ define <10 x i16> @bitcast_v5f32_to_v10i16(<5 x float> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB12_3: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v9, s4, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v0, v8, 16 ; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 ; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 @@ -1649,7 +1649,7 @@ define <10 x i16> @bitcast_v5f32_to_v10i16(<5 x float> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 ; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_alignbit_b32 v9, s4, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v0, v8, 16 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll index 7d0897bb2151b..e3b374b712717 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll @@ -476,7 +476,7 @@ define <14 x i16> @bitcast_v7i32_to_v14i16(<7 x i32> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB4_3: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v13, s4, v12, 16 +; SI-NEXT: v_alignbit_b32 v13, v0, v12, 16 ; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 ; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 ; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 @@ -496,7 +496,7 @@ define <14 x i16> @bitcast_v7i32_to_v14i16(<7 x i32> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 ; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 ; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_alignbit_b32 v13, s4, v12, 16 +; SI-NEXT: v_alignbit_b32 v13, v0, v12, 16 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 @@ -1939,7 +1939,7 @@ define <14 x i16> @bitcast_v7f32_to_v14i16(<7 x float> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB12_3: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v13, s4, v12, 16 +; SI-NEXT: v_alignbit_b32 v13, v0, v12, 16 ; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 ; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 ; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 @@ -1959,7 +1959,7 @@ define <14 x i16> @bitcast_v7f32_to_v14i16(<7 x float> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 ; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 ; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_alignbit_b32 v13, s4, v12, 16 +; SI-NEXT: v_alignbit_b32 v13, v0, v12, 16 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll index 57de868ad37b3..4e60831ca3da5 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll @@ -526,7 +526,7 @@ define <18 x i16> @bitcast_v9i32_to_v18i16(<9 x i32> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB4_3: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v17, s4, v16, 16 +; SI-NEXT: v_alignbit_b32 v17, v0, v16, 16 ; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 ; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 ; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 @@ -551,7 +551,7 @@ define <18 x i16> @bitcast_v9i32_to_v18i16(<9 x i32> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 ; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 ; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_alignbit_b32 v17, s4, v16, 16 +; SI-NEXT: v_alignbit_b32 v17, v0, v16, 16 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 @@ -2240,7 +2240,7 @@ define <18 x i16> @bitcast_v9f32_to_v18i16(<9 x float> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB12_3: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v17, s4, v16, 16 +; SI-NEXT: v_alignbit_b32 v17, v0, v16, 16 ; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 ; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 ; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 @@ -2265,7 +2265,7 @@ define <18 x i16> @bitcast_v9f32_to_v18i16(<9 x float> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 ; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 ; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_alignbit_b32 v17, s4, v16, 16 +; SI-NEXT: v_alignbit_b32 v17, v0, v16, 16 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll index 5b42f951b8fa3..6fae7fdbbf9bb 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll @@ -20108,7 +20108,7 @@ define <5 x double> @bitcast_v20i16_to_v5f64(<20 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 ; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v9 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] @@ -20954,7 +20954,7 @@ define <5 x i64> @bitcast_v20i16_to_v5i64(<20 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 ; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v9 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll index f335b48ba4ae1..bd8c305606364 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll @@ -581,7 +581,7 @@ define <22 x i16> @bitcast_v11i32_to_v22i16(<11 x i32> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB4_3: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v21, s4, v20, 16 +; SI-NEXT: v_alignbit_b32 v21, v0, v20, 16 ; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 ; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 ; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 @@ -611,7 +611,7 @@ define <22 x i16> @bitcast_v11i32_to_v22i16(<11 x i32> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 ; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 ; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_alignbit_b32 v21, s4, v20, 16 +; SI-NEXT: v_alignbit_b32 v21, v0, v20, 16 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 @@ -2541,7 +2541,7 @@ define <22 x i16> @bitcast_v11f32_to_v22i16(<11 x float> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB12_3: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v21, s4, v20, 16 +; SI-NEXT: v_alignbit_b32 v21, v0, v20, 16 ; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 ; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 ; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 @@ -2571,7 +2571,7 @@ define <22 x i16> @bitcast_v11f32_to_v22i16(<11 x float> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 ; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 ; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_alignbit_b32 v21, s4, v20, 16 +; SI-NEXT: v_alignbit_b32 v21, v0, v20, 16 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll index 2cde373ec130c..4f6801a4dcdfd 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll @@ -7980,7 +7980,7 @@ define <6 x double> @bitcast_v24i16_to_v6f64(<24 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v10 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v11 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] @@ -10034,7 +10034,7 @@ define <6 x i64> @bitcast_v24i16_to_v6i64(<24 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v10 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v11 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll index 718851f97bade..7fbc631c10e34 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll @@ -2383,7 +2383,7 @@ define <14 x i32> @bitcast_v28i16_to_v14i32(<28 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v11 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v12 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] @@ -5942,7 +5942,7 @@ define <14 x float> @bitcast_v28i16_to_v14f32(<28 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v11 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v12 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] @@ -8862,7 +8862,7 @@ define <7 x i64> @bitcast_v28i16_to_v7i64(<28 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v11 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v12 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] @@ -11188,7 +11188,7 @@ define <7 x double> @bitcast_v28i16_to_v7f64(<28 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v11 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v12 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index 88d521a0eaa8b..4cf6ed8855818 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -2144,112 +2144,110 @@ define inreg <32 x i16> @bitcast_v16i32_to_v32i16_scalar(<16 x i32> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_readfirstlane_b32 s4, v0 -; SI-NEXT: s_and_b64 s[6:7], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s5, v1 +; SI-NEXT: v_mov_b32_e32 v33, v1 +; SI-NEXT: v_mov_b32_e32 v32, v0 +; SI-NEXT: v_mov_b32_e32 v34, s16 +; SI-NEXT: v_mov_b32_e32 v35, s17 +; SI-NEXT: v_mov_b32_e32 v36, s18 +; SI-NEXT: v_mov_b32_e32 v37, s19 +; SI-NEXT: v_mov_b32_e32 v38, s20 +; SI-NEXT: v_mov_b32_e32 v39, s21 +; SI-NEXT: v_mov_b32_e32 v48, s22 +; SI-NEXT: v_mov_b32_e32 v49, s23 +; SI-NEXT: v_mov_b32_e32 v50, s24 +; SI-NEXT: v_mov_b32_e32 v51, s25 +; SI-NEXT: v_mov_b32_e32 v52, s26 +; SI-NEXT: v_mov_b32_e32 v53, s27 +; SI-NEXT: v_mov_b32_e32 v54, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v55, s29 ; SI-NEXT: s_cbranch_scc0 .LBB13_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s56, s5, 16 -; SI-NEXT: s_lshr_b32 s57, s29, 16 -; SI-NEXT: s_lshr_b32 s58, s27, 16 -; SI-NEXT: s_lshr_b32 s59, s25, 16 -; SI-NEXT: s_lshr_b32 s60, s23, 16 -; SI-NEXT: s_lshr_b32 s61, s21, 16 -; SI-NEXT: s_lshr_b32 s62, s19, 16 -; SI-NEXT: s_lshr_b32 s63, s17, 16 -; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[8:9], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[10:11], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[12:13], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[14:15], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[40:41], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[42:43], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[16:17], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshr_b64 v[29:30], v[32:33], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[54:55], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[52:53], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[38:39], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[34:35], 16 ; SI-NEXT: s_cbranch_execnz .LBB13_3 ; SI-NEXT: .LBB13_2: ; %cmp.true -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s5, s5, 3 -; SI-NEXT: s_add_i32 s4, s4, 3 -; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[8:9], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[10:11], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[12:13], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[14:15], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[40:41], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[42:43], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[16:17], 16 -; SI-NEXT: s_lshr_b32 s56, s5, 16 -; SI-NEXT: s_lshr_b32 s57, s29, 16 -; SI-NEXT: s_lshr_b32 s58, s27, 16 -; SI-NEXT: s_lshr_b32 s59, s25, 16 -; SI-NEXT: s_lshr_b32 s60, s23, 16 -; SI-NEXT: s_lshr_b32 s61, s21, 16 -; SI-NEXT: s_lshr_b32 s62, s19, 16 -; SI-NEXT: s_lshr_b32 s63, s17, 16 +; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v51, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_lshr_b64 v[29:30], v[32:33], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[54:55], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[52:53], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[38:39], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[34:35], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 ; SI-NEXT: .LBB13_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v1, s44 -; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v3, s63 -; SI-NEXT: v_mov_b32_e32 v4, s18 -; SI-NEXT: v_mov_b32_e32 v5, s42 -; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v7, s62 -; SI-NEXT: v_mov_b32_e32 v8, s20 -; SI-NEXT: v_mov_b32_e32 v9, s40 -; SI-NEXT: v_mov_b32_e32 v10, s21 -; SI-NEXT: v_mov_b32_e32 v11, s61 -; SI-NEXT: v_mov_b32_e32 v12, s22 -; SI-NEXT: v_mov_b32_e32 v13, s14 -; SI-NEXT: v_mov_b32_e32 v14, s23 -; SI-NEXT: v_mov_b32_e32 v15, s60 -; SI-NEXT: v_mov_b32_e32 v16, s24 -; SI-NEXT: v_mov_b32_e32 v17, s12 -; SI-NEXT: v_mov_b32_e32 v18, s25 -; SI-NEXT: v_mov_b32_e32 v19, s59 -; SI-NEXT: v_mov_b32_e32 v20, s26 -; SI-NEXT: v_mov_b32_e32 v21, s10 -; SI-NEXT: v_mov_b32_e32 v22, s27 -; SI-NEXT: v_mov_b32_e32 v23, s58 -; SI-NEXT: v_mov_b32_e32 v24, s28 -; SI-NEXT: v_mov_b32_e32 v25, s8 -; SI-NEXT: v_mov_b32_e32 v26, s29 -; SI-NEXT: v_mov_b32_e32 v27, s57 -; SI-NEXT: v_mov_b32_e32 v28, s4 -; SI-NEXT: v_mov_b32_e32 v29, s6 -; SI-NEXT: v_mov_b32_e32 v30, s5 -; SI-NEXT: v_mov_b32_e32 v31, s56 +; SI-NEXT: v_mov_b32_e32 v0, v34 +; SI-NEXT: v_mov_b32_e32 v2, v35 +; SI-NEXT: v_mov_b32_e32 v4, v36 +; SI-NEXT: v_mov_b32_e32 v6, v37 +; SI-NEXT: v_mov_b32_e32 v8, v38 +; SI-NEXT: v_mov_b32_e32 v10, v39 +; SI-NEXT: v_mov_b32_e32 v12, v48 +; SI-NEXT: v_mov_b32_e32 v14, v49 +; SI-NEXT: v_mov_b32_e32 v16, v50 +; SI-NEXT: v_mov_b32_e32 v18, v51 +; SI-NEXT: v_mov_b32_e32 v20, v52 +; SI-NEXT: v_mov_b32_e32 v22, v53 +; SI-NEXT: v_mov_b32_e32 v24, v54 +; SI-NEXT: v_mov_b32_e32 v26, v55 +; SI-NEXT: v_mov_b32_e32 v28, v32 +; SI-NEXT: v_mov_b32_e32 v30, v33 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB13_4: -; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $sgpr63 -; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: s_branch .LBB13_2 ; ; VI-LABEL: bitcast_v16i32_to_v32i16_scalar: @@ -2882,111 +2880,139 @@ define inreg <16 x i32> @bitcast_v32i16_to_v16i32_scalar(<32 x i16> inreg %a, i3 ; VI-LABEL: bitcast_v32i16_to_v16i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, s16 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v5, s18 +; VI-NEXT: v_mov_b32_e32 v6, s19 +; VI-NEXT: v_mov_b32_e32 v7, s20 +; VI-NEXT: v_mov_b32_e32 v8, s21 +; VI-NEXT: v_mov_b32_e32 v9, s22 +; VI-NEXT: v_mov_b32_e32 v10, s23 +; VI-NEXT: v_mov_b32_e32 v11, s24 +; VI-NEXT: v_mov_b32_e32 v12, s25 +; VI-NEXT: v_mov_b32_e32 v13, s26 +; VI-NEXT: v_mov_b32_e32 v14, s27 +; VI-NEXT: v_mov_b32_e32 v15, s28 +; VI-NEXT: v_mov_b32_e32 v16, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_readfirstlane_b32 s6, v0 +; VI-NEXT: v_readfirstlane_b32 s6, v3 +; VI-NEXT: v_readfirstlane_b32 s7, v4 +; VI-NEXT: v_readfirstlane_b32 s8, v5 +; VI-NEXT: v_readfirstlane_b32 s9, v6 +; VI-NEXT: v_readfirstlane_b32 s10, v7 +; VI-NEXT: v_readfirstlane_b32 s11, v8 +; VI-NEXT: v_readfirstlane_b32 s12, v9 +; VI-NEXT: v_readfirstlane_b32 s13, v10 +; VI-NEXT: v_readfirstlane_b32 s14, v11 +; VI-NEXT: v_readfirstlane_b32 s15, v12 +; VI-NEXT: v_readfirstlane_b32 s16, v13 +; VI-NEXT: v_readfirstlane_b32 s17, v14 +; VI-NEXT: v_readfirstlane_b32 s18, v15 +; VI-NEXT: v_readfirstlane_b32 s19, v16 +; VI-NEXT: v_readfirstlane_b32 s20, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_readfirstlane_b32 s7, v1 +; VI-NEXT: v_readfirstlane_b32 s21, v1 ; VI-NEXT: s_cbranch_scc0 .LBB15_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB15_3 ; VI-NEXT: .LBB15_2: ; %cmp.true -; VI-NEXT: s_add_i32 s5, s7, 3 -; VI-NEXT: s_and_b32 s4, s7, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s6, 3 -; VI-NEXT: s_add_i32 s7, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s6, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s29, 3 -; VI-NEXT: s_add_i32 s6, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s28, 3 -; VI-NEXT: s_add_i32 s29, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s27, 3 -; VI-NEXT: s_add_i32 s28, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s26, 3 -; VI-NEXT: s_add_i32 s27, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s25, 3 -; VI-NEXT: s_add_i32 s26, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s15, 3 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s15, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s24, 3 -; VI-NEXT: s_add_i32 s25, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s14, 3 +; VI-NEXT: s_add_i32 s15, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s14, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s23, 3 -; VI-NEXT: s_add_i32 s24, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s13, 3 +; VI-NEXT: s_add_i32 s14, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s13, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s22, 3 -; VI-NEXT: s_add_i32 s23, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s12, 3 +; VI-NEXT: s_add_i32 s13, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s12, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s21, 3 -; VI-NEXT: s_add_i32 s22, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s11, 3 +; VI-NEXT: s_add_i32 s12, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s11, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s20, 3 -; VI-NEXT: s_add_i32 s21, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s10, 3 +; VI-NEXT: s_add_i32 s11, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s10, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s19, 3 -; VI-NEXT: s_add_i32 s20, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s9, 3 +; VI-NEXT: s_add_i32 s10, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s9, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s18, 3 -; VI-NEXT: s_add_i32 s19, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s8, 3 +; VI-NEXT: s_add_i32 s9, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s8, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s17, 3 -; VI-NEXT: s_add_i32 s18, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s7, 3 +; VI-NEXT: s_add_i32 s8, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s7, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_add_i32 s17, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s6, 3 +; VI-NEXT: s_add_i32 s7, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s6, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: s_add_i32 s6, s4, 0x30000 ; VI-NEXT: .LBB15_3: ; %end -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 -; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 -; VI-NEXT: v_mov_b32_e32 v13, s29 -; VI-NEXT: v_mov_b32_e32 v14, s6 -; VI-NEXT: v_mov_b32_e32 v15, s7 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v5, s11 +; VI-NEXT: v_mov_b32_e32 v6, s12 +; VI-NEXT: v_mov_b32_e32 v7, s13 +; VI-NEXT: v_mov_b32_e32 v8, s14 +; VI-NEXT: v_mov_b32_e32 v9, s15 +; VI-NEXT: v_mov_b32_e32 v10, s16 +; VI-NEXT: v_mov_b32_e32 v11, s17 +; VI-NEXT: v_mov_b32_e32 v12, s18 +; VI-NEXT: v_mov_b32_e32 v13, s19 +; VI-NEXT: v_mov_b32_e32 v14, s20 +; VI-NEXT: v_mov_b32_e32 v15, s21 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB15_4: ; VI-NEXT: s_branch .LBB15_2 @@ -3403,124 +3429,152 @@ define inreg <32 x half> @bitcast_v16i32_to_v32f16_scalar(<16 x i32> inreg %a, i ; SI-LABEL: bitcast_v16i32_to_v32f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, s16 +; SI-NEXT: v_mov_b32_e32 v4, s17 +; SI-NEXT: v_mov_b32_e32 v5, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s20 +; SI-NEXT: v_mov_b32_e32 v8, s21 +; SI-NEXT: v_mov_b32_e32 v9, s22 +; SI-NEXT: v_mov_b32_e32 v10, s23 +; SI-NEXT: v_mov_b32_e32 v11, s24 +; SI-NEXT: v_mov_b32_e32 v12, s25 +; SI-NEXT: v_mov_b32_e32 v13, s26 +; SI-NEXT: v_mov_b32_e32 v14, s27 +; SI-NEXT: v_mov_b32_e32 v15, s28 +; SI-NEXT: v_mov_b32_e32 v16, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_readfirstlane_b32 s21, v3 +; SI-NEXT: v_readfirstlane_b32 s20, v4 +; SI-NEXT: v_readfirstlane_b32 s19, v5 +; SI-NEXT: v_readfirstlane_b32 s18, v6 +; SI-NEXT: v_readfirstlane_b32 s17, v7 +; SI-NEXT: v_readfirstlane_b32 s16, v8 +; SI-NEXT: v_readfirstlane_b32 s15, v9 +; SI-NEXT: v_readfirstlane_b32 s14, v10 +; SI-NEXT: v_readfirstlane_b32 s13, v11 +; SI-NEXT: v_readfirstlane_b32 s12, v12 +; SI-NEXT: v_readfirstlane_b32 s11, v13 +; SI-NEXT: v_readfirstlane_b32 s10, v14 +; SI-NEXT: v_readfirstlane_b32 s8, v15 +; SI-NEXT: v_readfirstlane_b32 s7, v16 ; SI-NEXT: v_readfirstlane_b32 s6, v0 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s7, v1 +; SI-NEXT: v_readfirstlane_b32 s9, v1 ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: s_lshr_b32 s4, s9, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 ; SI-NEXT: s_lshr_b32 s4, s6, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 -; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s4, s7, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: s_lshr_b32 s4, s8, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: s_lshr_b32 s4, s10, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: s_lshr_b32 s4, s11, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: s_lshr_b32 s4, s12, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: s_lshr_b32 s4, s13, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: s_lshr_b32 s4, s14, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: s_lshr_b32 s4, s15, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: s_lshr_b32 s4, s17, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: s_lshr_b32 s4, s20, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_lshr_b32 s4, s21, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v28, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s21 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: s_lshr_b32 s5, s17, 16 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: s_lshr_b32 s11, s21, 16 -; SI-NEXT: s_lshr_b32 s12, s22, 16 -; SI-NEXT: s_lshr_b32 s13, s23, 16 -; SI-NEXT: s_lshr_b32 s14, s24, 16 -; SI-NEXT: s_lshr_b32 s15, s25, 16 -; SI-NEXT: s_lshr_b32 s40, s26, 16 -; SI-NEXT: s_lshr_b32 s41, s27, 16 -; SI-NEXT: s_lshr_b32 s42, s28, 16 -; SI-NEXT: s_lshr_b32 s43, s29, 16 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: s_lshr_b32 s5, s20, 16 +; SI-NEXT: s_lshr_b32 s22, s19, 16 +; SI-NEXT: s_lshr_b32 s23, s18, 16 +; SI-NEXT: s_lshr_b32 s24, s17, 16 +; SI-NEXT: s_lshr_b32 s25, s16, 16 +; SI-NEXT: s_lshr_b32 s26, s15, 16 +; SI-NEXT: s_lshr_b32 s27, s14, 16 +; SI-NEXT: s_lshr_b32 s28, s13, 16 +; SI-NEXT: s_lshr_b32 s29, s12, 16 +; SI-NEXT: s_lshr_b32 s40, s11, 16 +; SI-NEXT: s_lshr_b32 s41, s10, 16 +; SI-NEXT: s_lshr_b32 s42, s8, 16 +; SI-NEXT: s_lshr_b32 s43, s7, 16 ; SI-NEXT: s_lshr_b32 s44, s6, 16 -; SI-NEXT: s_lshr_b32 s45, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s7 +; SI-NEXT: s_lshr_b32 s45, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v28, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s21 ; SI-NEXT: v_cvt_f32_f16_e32 v31, s45 ; SI-NEXT: v_cvt_f32_f16_e32 v29, s44 ; SI-NEXT: v_cvt_f32_f16_e32 v27, s43 ; SI-NEXT: v_cvt_f32_f16_e32 v25, s42 ; SI-NEXT: v_cvt_f32_f16_e32 v23, s41 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s22 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; SI-NEXT: .LBB17_3: ; %end @@ -4788,7 +4842,35 @@ define inreg <32 x bfloat> @bitcast_v16i32_to_v32bf16_scalar(<16 x i32> inreg %a ; SI-LABEL: bitcast_v16i32_to_v32bf16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, s16 +; SI-NEXT: v_mov_b32_e32 v4, s17 +; SI-NEXT: v_mov_b32_e32 v5, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s20 +; SI-NEXT: v_mov_b32_e32 v8, s21 +; SI-NEXT: v_mov_b32_e32 v9, s22 +; SI-NEXT: v_mov_b32_e32 v10, s23 +; SI-NEXT: v_mov_b32_e32 v11, s24 +; SI-NEXT: v_mov_b32_e32 v12, s25 +; SI-NEXT: v_mov_b32_e32 v13, s26 +; SI-NEXT: v_mov_b32_e32 v14, s27 +; SI-NEXT: v_mov_b32_e32 v15, s28 +; SI-NEXT: v_mov_b32_e32 v16, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_readfirstlane_b32 s56, v3 +; SI-NEXT: v_readfirstlane_b32 s57, v4 +; SI-NEXT: v_readfirstlane_b32 s58, v5 +; SI-NEXT: v_readfirstlane_b32 s59, v6 +; SI-NEXT: v_readfirstlane_b32 s60, v7 +; SI-NEXT: v_readfirstlane_b32 s61, v8 +; SI-NEXT: v_readfirstlane_b32 s62, v9 +; SI-NEXT: v_readfirstlane_b32 s63, v10 +; SI-NEXT: v_readfirstlane_b32 s72, v11 +; SI-NEXT: v_readfirstlane_b32 s73, v12 +; SI-NEXT: v_readfirstlane_b32 s74, v13 +; SI-NEXT: v_readfirstlane_b32 s75, v14 +; SI-NEXT: v_readfirstlane_b32 s76, v15 +; SI-NEXT: v_readfirstlane_b32 s77, v16 ; SI-NEXT: v_readfirstlane_b32 s78, v0 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_readfirstlane_b32 s79, v1 @@ -4798,107 +4880,107 @@ define inreg <32 x bfloat> @bitcast_v16i32_to_v32bf16_scalar(<16 x i32> inreg %a ; SI-NEXT: s_lshl_b32 s7, s79, 16 ; SI-NEXT: s_and_b32 s8, s78, 0xffff0000 ; SI-NEXT: s_lshl_b32 s9, s78, 16 -; SI-NEXT: s_and_b32 s10, s29, 0xffff0000 -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: s_and_b32 s12, s28, 0xffff0000 -; SI-NEXT: s_lshl_b32 s13, s28, 16 -; SI-NEXT: s_and_b32 s14, s27, 0xffff0000 -; SI-NEXT: s_lshl_b32 s15, s27, 16 -; SI-NEXT: s_and_b32 s40, s26, 0xffff0000 -; SI-NEXT: s_lshl_b32 s41, s26, 16 -; SI-NEXT: s_and_b32 s42, s25, 0xffff0000 -; SI-NEXT: s_lshl_b32 s43, s25, 16 -; SI-NEXT: s_and_b32 s44, s24, 0xffff0000 -; SI-NEXT: s_lshl_b32 s45, s24, 16 -; SI-NEXT: s_and_b32 s46, s23, 0xffff0000 -; SI-NEXT: s_lshl_b32 s47, s23, 16 -; SI-NEXT: s_and_b32 s56, s22, 0xffff0000 -; SI-NEXT: s_lshl_b32 s57, s22, 16 -; SI-NEXT: s_and_b32 s58, s21, 0xffff0000 -; SI-NEXT: s_lshl_b32 s59, s21, 16 -; SI-NEXT: s_and_b32 s60, s20, 0xffff0000 -; SI-NEXT: s_lshl_b32 s61, s20, 16 -; SI-NEXT: s_and_b32 s62, s19, 0xffff0000 -; SI-NEXT: s_lshl_b32 s63, s19, 16 -; SI-NEXT: s_and_b32 s72, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s73, s18, 16 -; SI-NEXT: s_and_b32 s74, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s75, s17, 16 -; SI-NEXT: s_and_b32 s76, s16, 0xffff0000 -; SI-NEXT: s_lshl_b32 s77, s16, 16 +; SI-NEXT: s_and_b32 s10, s77, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s77, 16 +; SI-NEXT: s_and_b32 s12, s76, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s76, 16 +; SI-NEXT: s_and_b32 s14, s75, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s75, 16 +; SI-NEXT: s_and_b32 s16, s74, 0xffff0000 +; SI-NEXT: s_lshl_b32 s17, s74, 16 +; SI-NEXT: s_and_b32 s18, s73, 0xffff0000 +; SI-NEXT: s_lshl_b32 s19, s73, 16 +; SI-NEXT: s_and_b32 s20, s72, 0xffff0000 +; SI-NEXT: s_lshl_b32 s21, s72, 16 +; SI-NEXT: s_and_b32 s22, s63, 0xffff0000 +; SI-NEXT: s_lshl_b32 s23, s63, 16 +; SI-NEXT: s_and_b32 s24, s62, 0xffff0000 +; SI-NEXT: s_lshl_b32 s25, s62, 16 +; SI-NEXT: s_and_b32 s26, s61, 0xffff0000 +; SI-NEXT: s_lshl_b32 s27, s61, 16 +; SI-NEXT: s_and_b32 s28, s60, 0xffff0000 +; SI-NEXT: s_lshl_b32 s29, s60, 16 +; SI-NEXT: s_and_b32 s40, s59, 0xffff0000 +; SI-NEXT: s_lshl_b32 s41, s59, 16 +; SI-NEXT: s_and_b32 s42, s58, 0xffff0000 +; SI-NEXT: s_lshl_b32 s43, s58, 16 +; SI-NEXT: s_and_b32 s44, s57, 0xffff0000 +; SI-NEXT: s_lshl_b32 s45, s57, 16 +; SI-NEXT: s_and_b32 s46, s56, 0xffff0000 +; SI-NEXT: s_lshl_b32 s47, s56, 16 ; SI-NEXT: s_cbranch_execnz .LBB21_3 ; SI-NEXT: .LBB21_2: ; %cmp.true -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s56, s56, 3 +; SI-NEXT: s_add_i32 s57, s57, 3 +; SI-NEXT: s_add_i32 s58, s58, 3 +; SI-NEXT: s_add_i32 s59, s59, 3 +; SI-NEXT: s_add_i32 s60, s60, 3 +; SI-NEXT: s_add_i32 s61, s61, 3 +; SI-NEXT: s_add_i32 s62, s62, 3 +; SI-NEXT: s_add_i32 s63, s63, 3 +; SI-NEXT: s_add_i32 s72, s72, 3 +; SI-NEXT: s_add_i32 s73, s73, 3 +; SI-NEXT: s_add_i32 s74, s74, 3 +; SI-NEXT: s_add_i32 s75, s75, 3 +; SI-NEXT: s_add_i32 s76, s76, 3 +; SI-NEXT: s_add_i32 s77, s77, 3 ; SI-NEXT: s_add_i32 s78, s78, 3 ; SI-NEXT: s_add_i32 s79, s79, 3 ; SI-NEXT: s_and_b32 s6, s79, 0xffff0000 ; SI-NEXT: s_lshl_b32 s7, s79, 16 ; SI-NEXT: s_and_b32 s8, s78, 0xffff0000 ; SI-NEXT: s_lshl_b32 s9, s78, 16 -; SI-NEXT: s_and_b32 s10, s29, 0xffff0000 -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: s_and_b32 s12, s28, 0xffff0000 -; SI-NEXT: s_lshl_b32 s13, s28, 16 -; SI-NEXT: s_and_b32 s14, s27, 0xffff0000 -; SI-NEXT: s_lshl_b32 s15, s27, 16 -; SI-NEXT: s_and_b32 s40, s26, 0xffff0000 -; SI-NEXT: s_lshl_b32 s41, s26, 16 -; SI-NEXT: s_and_b32 s42, s25, 0xffff0000 -; SI-NEXT: s_lshl_b32 s43, s25, 16 -; SI-NEXT: s_and_b32 s44, s24, 0xffff0000 -; SI-NEXT: s_lshl_b32 s45, s24, 16 -; SI-NEXT: s_and_b32 s46, s23, 0xffff0000 -; SI-NEXT: s_lshl_b32 s47, s23, 16 -; SI-NEXT: s_and_b32 s56, s22, 0xffff0000 -; SI-NEXT: s_lshl_b32 s57, s22, 16 -; SI-NEXT: s_and_b32 s58, s21, 0xffff0000 -; SI-NEXT: s_lshl_b32 s59, s21, 16 -; SI-NEXT: s_and_b32 s60, s20, 0xffff0000 -; SI-NEXT: s_lshl_b32 s61, s20, 16 -; SI-NEXT: s_and_b32 s62, s19, 0xffff0000 -; SI-NEXT: s_lshl_b32 s63, s19, 16 -; SI-NEXT: s_and_b32 s72, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s73, s18, 16 -; SI-NEXT: s_and_b32 s74, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s75, s17, 16 -; SI-NEXT: s_and_b32 s76, s16, 0xffff0000 -; SI-NEXT: s_lshl_b32 s77, s16, 16 +; SI-NEXT: s_and_b32 s10, s77, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s77, 16 +; SI-NEXT: s_and_b32 s12, s76, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s76, 16 +; SI-NEXT: s_and_b32 s14, s75, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s75, 16 +; SI-NEXT: s_and_b32 s16, s74, 0xffff0000 +; SI-NEXT: s_lshl_b32 s17, s74, 16 +; SI-NEXT: s_and_b32 s18, s73, 0xffff0000 +; SI-NEXT: s_lshl_b32 s19, s73, 16 +; SI-NEXT: s_and_b32 s20, s72, 0xffff0000 +; SI-NEXT: s_lshl_b32 s21, s72, 16 +; SI-NEXT: s_and_b32 s22, s63, 0xffff0000 +; SI-NEXT: s_lshl_b32 s23, s63, 16 +; SI-NEXT: s_and_b32 s24, s62, 0xffff0000 +; SI-NEXT: s_lshl_b32 s25, s62, 16 +; SI-NEXT: s_and_b32 s26, s61, 0xffff0000 +; SI-NEXT: s_lshl_b32 s27, s61, 16 +; SI-NEXT: s_and_b32 s28, s60, 0xffff0000 +; SI-NEXT: s_lshl_b32 s29, s60, 16 +; SI-NEXT: s_and_b32 s40, s59, 0xffff0000 +; SI-NEXT: s_lshl_b32 s41, s59, 16 +; SI-NEXT: s_and_b32 s42, s58, 0xffff0000 +; SI-NEXT: s_lshl_b32 s43, s58, 16 +; SI-NEXT: s_and_b32 s44, s57, 0xffff0000 +; SI-NEXT: s_lshl_b32 s45, s57, 16 +; SI-NEXT: s_and_b32 s46, s56, 0xffff0000 +; SI-NEXT: s_lshl_b32 s47, s56, 16 ; SI-NEXT: .LBB21_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s77 -; SI-NEXT: v_mov_b32_e32 v1, s76 -; SI-NEXT: v_mov_b32_e32 v2, s75 -; SI-NEXT: v_mov_b32_e32 v3, s74 -; SI-NEXT: v_mov_b32_e32 v4, s73 -; SI-NEXT: v_mov_b32_e32 v5, s72 -; SI-NEXT: v_mov_b32_e32 v6, s63 -; SI-NEXT: v_mov_b32_e32 v7, s62 -; SI-NEXT: v_mov_b32_e32 v8, s61 -; SI-NEXT: v_mov_b32_e32 v9, s60 -; SI-NEXT: v_mov_b32_e32 v10, s59 -; SI-NEXT: v_mov_b32_e32 v11, s58 -; SI-NEXT: v_mov_b32_e32 v12, s57 -; SI-NEXT: v_mov_b32_e32 v13, s56 -; SI-NEXT: v_mov_b32_e32 v14, s47 -; SI-NEXT: v_mov_b32_e32 v15, s46 -; SI-NEXT: v_mov_b32_e32 v16, s45 -; SI-NEXT: v_mov_b32_e32 v17, s44 -; SI-NEXT: v_mov_b32_e32 v18, s43 -; SI-NEXT: v_mov_b32_e32 v19, s42 -; SI-NEXT: v_mov_b32_e32 v20, s41 -; SI-NEXT: v_mov_b32_e32 v21, s40 +; SI-NEXT: v_mov_b32_e32 v0, s47 +; SI-NEXT: v_mov_b32_e32 v1, s46 +; SI-NEXT: v_mov_b32_e32 v2, s45 +; SI-NEXT: v_mov_b32_e32 v3, s44 +; SI-NEXT: v_mov_b32_e32 v4, s43 +; SI-NEXT: v_mov_b32_e32 v5, s42 +; SI-NEXT: v_mov_b32_e32 v6, s41 +; SI-NEXT: v_mov_b32_e32 v7, s40 +; SI-NEXT: v_mov_b32_e32 v8, s29 +; SI-NEXT: v_mov_b32_e32 v9, s28 +; SI-NEXT: v_mov_b32_e32 v10, s27 +; SI-NEXT: v_mov_b32_e32 v11, s26 +; SI-NEXT: v_mov_b32_e32 v12, s25 +; SI-NEXT: v_mov_b32_e32 v13, s24 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v15, s22 +; SI-NEXT: v_mov_b32_e32 v16, s21 +; SI-NEXT: v_mov_b32_e32 v17, s20 +; SI-NEXT: v_mov_b32_e32 v18, s19 +; SI-NEXT: v_mov_b32_e32 v19, s18 +; SI-NEXT: v_mov_b32_e32 v20, s17 +; SI-NEXT: v_mov_b32_e32 v21, s16 ; SI-NEXT: v_mov_b32_e32 v22, s15 ; SI-NEXT: v_mov_b32_e32 v23, s14 ; SI-NEXT: v_mov_b32_e32 v24, s13 @@ -4911,20 +4993,6 @@ define inreg <32 x bfloat> @bitcast_v16i32_to_v32bf16_scalar(<16 x i32> inreg %a ; SI-NEXT: v_mov_b32_e32 v31, s6 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB21_4: -; SI-NEXT: ; implicit-def: $sgpr77 -; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr75 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr73 -; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr63 -; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: ; implicit-def: $sgpr47 ; SI-NEXT: ; implicit-def: $sgpr46 ; SI-NEXT: ; implicit-def: $sgpr45 @@ -4933,6 +5001,20 @@ define inreg <32 x bfloat> @bitcast_v16i32_to_v32bf16_scalar(<16 x i32> inreg %a ; SI-NEXT: ; implicit-def: $sgpr42 ; SI-NEXT: ; implicit-def: $sgpr41 ; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr21 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; implicit-def: $sgpr16 ; SI-NEXT: ; implicit-def: $sgpr15 ; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: ; implicit-def: $sgpr13 @@ -6708,695 +6790,665 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; VI-LABEL: bitcast_v32bf16_to_v16i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v20, s30, 0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_writelane_b32 v20, s31, 1 -; VI-NEXT: v_readfirstlane_b32 s30, v0 +; VI-NEXT: v_mov_b32_e32 v10, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_readfirstlane_b32 s31, v1 -; VI-NEXT: s_cbranch_scc0 .LBB23_3 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_cbranch_scc0 .LBB23_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB23_4 +; VI-NEXT: s_cbranch_execnz .LBB23_3 ; VI-NEXT: .LBB23_2: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v16, 0x40c00000 -; VI-NEXT: s_lshl_b32 s4, s30, 16 -; VI-NEXT: v_add_f32_e32 v0, s4, v16 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; VI-NEXT: v_add_f32_e32 v1, s4, v16 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v16 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: s_and_b32 s6, s28, 0xffff0000 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; VI-NEXT: v_add_f32_e32 v2, s6, v16 -; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc -; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[0:1] -; VI-NEXT: v_cndmask_b32_e64 v0, v3, v4, s[4:5] -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; VI-NEXT: s_and_b32 s5, s26, 0xffff0000 -; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[0:1] -; VI-NEXT: v_add_f32_e32 v0, s5, v16 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: s_lshl_b32 s4, s26, 16 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_add_f32_e32 v0, s4, v16 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_and_b32 s5, s24, 0xffff0000 -; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[0:1] -; VI-NEXT: v_add_f32_e32 v0, s5, v16 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_add_f32_e32 v0, s4, v16 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_and_b32 s5, s22, 0xffff0000 -; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[0:1] -; VI-NEXT: v_add_f32_e32 v0, s5, v16 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_add_f32_e32 v0, s4, v16 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_and_b32 s5, s20, 0xffff0000 -; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[0:1] -; VI-NEXT: v_add_f32_e32 v0, s5, v16 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_add_f32_e32 v0, s4, v16 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_and_b32 s5, s18, 0xffff0000 -; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] -; VI-NEXT: v_add_f32_e32 v0, s5, v16 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_add_f32_e32 v0, s4, v16 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] -; VI-NEXT: v_add_f32_e32 v0, s4, v16 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; VI-NEXT: v_add_f32_e32 v0, s6, v16 -; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; VI-NEXT: s_lshl_b32 s6, s17, 16 -; VI-NEXT: v_cndmask_b32_e64 v0, v1, v3, s[4:5] -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; VI-NEXT: v_add_f32_e32 v3, s6, v16 -; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] -; VI-NEXT: v_bfe_u32 v1, v3, 16, 1 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; VI-NEXT: v_add_f32_e32 v3, s6, v16 -; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc -; VI-NEXT: s_lshl_b32 s6, s19, 16 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 -; VI-NEXT: v_add_f32_e32 v3, s6, v16 -; VI-NEXT: v_cndmask_b32_e64 v17, v1, v5, s[4:5] -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: s_and_b32 s6, s19, 0xffff0000 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; VI-NEXT: v_add_f32_e32 v3, s6, v16 -; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] -; VI-NEXT: v_bfe_u32 v9, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v3 -; VI-NEXT: s_lshl_b32 s6, s21, 16 -; VI-NEXT: v_mov_b32_e32 v1, v17 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_cndmask_b32_e64 v17, v5, v7, s[4:5] -; VI-NEXT: v_add_f32_e32 v5, s6, v16 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 -; VI-NEXT: s_and_b32 s6, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v9, v11, vcc -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; VI-NEXT: v_add_f32_e32 v5, s6, v16 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 -; VI-NEXT: v_bfe_u32 v11, v5, 16, 1 -; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] -; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v5 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 -; VI-NEXT: s_lshl_b32 s6, s23, 16 -; VI-NEXT: v_mov_b32_e32 v3, v17 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_cndmask_b32_e64 v17, v7, v9, s[4:5] -; VI-NEXT: v_add_f32_e32 v7, s6, v16 -; VI-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc -; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 -; VI-NEXT: s_and_b32 s6, s23, 0xffff0000 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v7, v7 -; VI-NEXT: v_add_f32_e32 v7, s6, v16 -; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] -; VI-NEXT: v_bfe_u32 v13, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v7 -; VI-NEXT: s_lshl_b32 s6, s25, 16 -; VI-NEXT: v_mov_b32_e32 v5, v17 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_cndmask_b32_e64 v17, v9, v11, s[4:5] -; VI-NEXT: v_add_f32_e32 v9, s6, v16 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 -; VI-NEXT: s_and_b32 s6, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v7, v13, v15, vcc -; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v9, v9 -; VI-NEXT: v_add_f32_e32 v9, s6, v16 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 -; VI-NEXT: v_bfe_u32 v15, v9, 16, 1 -; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] -; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v9 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[16:17] +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v17, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 -; VI-NEXT: v_mov_b32_e32 v7, v17 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: v_cndmask_b32_e32 v9, v15, v17, vcc -; VI-NEXT: s_lshl_b32 s6, s27, 16 -; VI-NEXT: v_cndmask_b32_e64 v17, v11, v13, s[4:5] -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 -; VI-NEXT: v_add_f32_e32 v11, s6, v16 -; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] -; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 -; VI-NEXT: s_and_b32 s6, s27, 0xffff0000 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v11, v11 -; VI-NEXT: v_add_f32_e32 v11, s6, v16 -; VI-NEXT: v_mov_b32_e32 v9, v17 -; VI-NEXT: v_bfe_u32 v17, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v11 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_cndmask_b32_e32 v14, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[17:18] +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v13 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v17, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15 +; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v11 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: s_and_b32 s7, s31, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc -; VI-NEXT: v_cndmask_b32_e64 v17, v13, v15, s[4:5] -; VI-NEXT: v_add_f32_e32 v13, s7, v16 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 -; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc ; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 -; VI-NEXT: s_lshl_b32 s6, s31, 16 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 -; VI-NEXT: v_mov_b32_e32 v11, v17 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v13 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_add_f32_e32 v13, s6, v16 -; VI-NEXT: v_cndmask_b32_e32 v15, v15, v17, vcc -; VI-NEXT: v_bfe_u32 v17, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v13 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_add_f32_e32 v13, s4, v16 -; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v18, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[18:19] +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v11 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 ; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_cndmask_b32_e32 v13, v15, v19, vcc -; VI-NEXT: v_add_f32_e32 v15, s4, v16 -; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: v_cndmask_b32_e32 v15, v16, v19, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 -; VI-NEXT: v_lshrrev_b64 v[15:16], 16, v[15:16] -; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[17:18] -; VI-NEXT: v_mov_b32_e32 v13, v15 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v18, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v19, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_cndmask_b32_e32 v10, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v10 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[19:20] +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v9 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v19, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_lshrrev_b64 v[19:20], 16, v[19:20] +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v20, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[20:21] +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v7 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v20, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[20:21] +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v21, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[21:22] +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v21, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_lshrrev_b64 v[21:22], 16, v[21:22] +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v22, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[22:23] +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v22, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[22:23] +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v23, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[23:24] +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v23, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[23:24] +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v24, v3, v5, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[24:25] +; VI-NEXT: v_mov_b32_e32 v1, v23 +; VI-NEXT: v_mov_b32_e32 v3, v22 +; VI-NEXT: v_mov_b32_e32 v5, v21 +; VI-NEXT: v_mov_b32_e32 v7, v20 +; VI-NEXT: v_mov_b32_e32 v9, v19 +; VI-NEXT: v_mov_b32_e32 v11, v18 +; VI-NEXT: v_mov_b32_e32 v13, v17 ; VI-NEXT: v_mov_b32_e32 v15, v16 -; VI-NEXT: s_branch .LBB23_5 -; VI-NEXT: .LBB23_3: -; VI-NEXT: s_branch .LBB23_2 -; VI-NEXT: .LBB23_4: -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 -; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 -; VI-NEXT: v_mov_b32_e32 v13, s29 -; VI-NEXT: v_mov_b32_e32 v14, s30 -; VI-NEXT: v_mov_b32_e32 v15, s31 -; VI-NEXT: .LBB23_5: ; %end -; VI-NEXT: v_readlane_b32 s31, v20, 1 -; VI-NEXT: v_readlane_b32 s30, v20, 0 -; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: .LBB23_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB23_4: +; VI-NEXT: s_branch .LBB23_2 ; ; GFX9-LABEL: bitcast_v32bf16_to_v16i32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v20, s30, 0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: v_writelane_b32 v20, s31, 1 -; GFX9-NEXT: v_readfirstlane_b32 s30, v0 +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_readfirstlane_b32 s31, v1 -; GFX9-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB23_4 +; GFX9-NEXT: s_cbranch_execnz .LBB23_3 ; GFX9-NEXT: .LBB23_2: ; %cmp.true -; GFX9-NEXT: s_and_b32 s4, s31, 0xffff0000 -; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s31, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b32 s4, s30, 0xffff0000 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: s_lshl_b32 s4, s30, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v16 +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v16, v16, v15 +; GFX9-NEXT: v_add_u32_e32 v16, 0x7fff, v16 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v18, vcc ; GFX9-NEXT: v_mov_b32_e32 v16, 0xffff -; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_and_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v15, v17, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v14 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v14, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v14 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v14 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v14, v16, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v14, v17, 16, v14 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v13 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v13, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v13 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v13, v17, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v12 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v12, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v12 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v12 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v12, v16, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v12, v17, 16, v12 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v11 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v11, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v11 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v11 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v11, v16, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v11, v17, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v10 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v10, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v10 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v10, v16, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v10, v17, 16, v10 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v9 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v9, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v9 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v9, v16, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v9, v17, 16, v9 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v8, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v8 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v8, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v8, v17, 16, v8 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v7 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v7, v16, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v7, v17, 16, v7 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v6 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v6 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v6, v16, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v6, v17, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v5 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v5 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v5, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v5, v17, 16, v5 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v4 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v4 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s29, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s28, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s28, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s27, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s26, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s26, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s25, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s24, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s24, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s23, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s22, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s21, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s20, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s19, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v17, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s18, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v17, v17, v2 -; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v4, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v4, v17, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v3 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v2 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v17, v17, v1 -; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s17, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc -; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_lshl_or_b32 v2, v17, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v17, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v17 -; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_bfe_u32 v18, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v1 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v17, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v0 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 -; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc ; GFX9-NEXT: v_bfe_u32 v18, v0, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v18, v18, v0 ; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: v_and_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v16 -; GFX9-NEXT: s_branch .LBB23_5 -; GFX9-NEXT: .LBB23_3: -; GFX9-NEXT: s_branch .LBB23_2 -; GFX9-NEXT: .LBB23_4: -; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 -; GFX9-NEXT: v_mov_b32_e32 v4, s20 -; GFX9-NEXT: v_mov_b32_e32 v5, s21 -; GFX9-NEXT: v_mov_b32_e32 v6, s22 -; GFX9-NEXT: v_mov_b32_e32 v7, s23 -; GFX9-NEXT: v_mov_b32_e32 v8, s24 -; GFX9-NEXT: v_mov_b32_e32 v9, s25 -; GFX9-NEXT: v_mov_b32_e32 v10, s26 -; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: v_mov_b32_e32 v12, s28 -; GFX9-NEXT: v_mov_b32_e32 v13, s29 -; GFX9-NEXT: v_mov_b32_e32 v14, s30 -; GFX9-NEXT: v_mov_b32_e32 v15, s31 -; GFX9-NEXT: .LBB23_5: ; %end -; GFX9-NEXT: v_readlane_b32 s31, v20, 1 -; GFX9-NEXT: v_readlane_b32 s30, v20, 0 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v17, 16, v0 +; GFX9-NEXT: .LBB23_3: ; %end ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: s_branch .LBB23_2 ; ; GFX11-TRUE16-LABEL: bitcast_v32bf16_to_v16i32_scalar: ; GFX11-TRUE16: ; %bb.0: @@ -9751,345 +9803,373 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v4, s30, 0 -; SI-NEXT: v_writelane_b32 v4, s31, 1 -; SI-NEXT: v_writelane_b32 v4, s34, 2 -; SI-NEXT: v_writelane_b32 v4, s35, 3 -; SI-NEXT: v_writelane_b32 v4, s36, 4 -; SI-NEXT: v_writelane_b32 v4, s37, 5 -; SI-NEXT: v_writelane_b32 v4, s38, 6 -; SI-NEXT: v_writelane_b32 v4, s39, 7 -; SI-NEXT: v_writelane_b32 v4, s48, 8 -; SI-NEXT: v_writelane_b32 v4, s49, 9 -; SI-NEXT: v_writelane_b32 v4, s50, 10 -; SI-NEXT: v_writelane_b32 v4, s51, 11 -; SI-NEXT: v_writelane_b32 v4, s52, 12 -; SI-NEXT: v_writelane_b32 v4, s53, 13 -; SI-NEXT: v_writelane_b32 v4, s54, 14 -; SI-NEXT: v_writelane_b32 v4, s55, 15 -; SI-NEXT: v_writelane_b32 v4, s64, 16 -; SI-NEXT: v_writelane_b32 v4, s65, 17 -; SI-NEXT: v_writelane_b32 v4, s66, 18 -; SI-NEXT: v_writelane_b32 v4, s67, 19 -; SI-NEXT: v_writelane_b32 v4, s68, 20 -; SI-NEXT: v_writelane_b32 v4, s69, 21 -; SI-NEXT: v_writelane_b32 v4, s70, 22 -; SI-NEXT: v_writelane_b32 v4, s71, 23 -; SI-NEXT: v_writelane_b32 v4, s80, 24 -; SI-NEXT: v_writelane_b32 v4, s81, 25 -; SI-NEXT: v_writelane_b32 v4, s82, 26 -; SI-NEXT: v_writelane_b32 v4, s83, 27 +; SI-NEXT: v_writelane_b32 v18, s30, 0 +; SI-NEXT: v_writelane_b32 v18, s31, 1 +; SI-NEXT: v_writelane_b32 v18, s34, 2 +; SI-NEXT: v_writelane_b32 v18, s35, 3 +; SI-NEXT: v_writelane_b32 v18, s36, 4 +; SI-NEXT: v_writelane_b32 v18, s37, 5 +; SI-NEXT: v_writelane_b32 v18, s38, 6 +; SI-NEXT: v_writelane_b32 v18, s39, 7 +; SI-NEXT: v_writelane_b32 v18, s48, 8 +; SI-NEXT: v_writelane_b32 v18, s49, 9 +; SI-NEXT: v_writelane_b32 v18, s50, 10 +; SI-NEXT: v_writelane_b32 v18, s51, 11 +; SI-NEXT: v_writelane_b32 v18, s52, 12 +; SI-NEXT: v_writelane_b32 v18, s53, 13 +; SI-NEXT: v_writelane_b32 v18, s54, 14 +; SI-NEXT: v_writelane_b32 v18, s55, 15 +; SI-NEXT: v_writelane_b32 v18, s64, 16 +; SI-NEXT: v_writelane_b32 v18, s65, 17 +; SI-NEXT: v_writelane_b32 v18, s66, 18 +; SI-NEXT: v_writelane_b32 v18, s67, 19 +; SI-NEXT: v_writelane_b32 v18, s68, 20 +; SI-NEXT: v_writelane_b32 v18, s69, 21 +; SI-NEXT: v_writelane_b32 v18, s70, 22 +; SI-NEXT: v_writelane_b32 v18, s71, 23 +; SI-NEXT: v_writelane_b32 v18, s80, 24 +; SI-NEXT: v_writelane_b32 v18, s81, 25 +; SI-NEXT: v_writelane_b32 v18, s82, 26 +; SI-NEXT: v_writelane_b32 v18, s83, 27 +; SI-NEXT: v_mov_b32_e32 v4, s16 +; SI-NEXT: v_mov_b32_e32 v5, s17 +; SI-NEXT: v_mov_b32_e32 v6, s18 +; SI-NEXT: v_mov_b32_e32 v7, s19 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v9, s21 +; SI-NEXT: v_mov_b32_e32 v10, s22 +; SI-NEXT: v_mov_b32_e32 v11, s23 +; SI-NEXT: v_mov_b32_e32 v12, s24 +; SI-NEXT: v_mov_b32_e32 v13, s25 +; SI-NEXT: v_mov_b32_e32 v14, s26 +; SI-NEXT: v_mov_b32_e32 v15, s27 +; SI-NEXT: v_mov_b32_e32 v16, s28 +; SI-NEXT: v_mov_b32_e32 v17, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; SI-NEXT: v_writelane_b32 v4, s84, 28 +; SI-NEXT: v_writelane_b32 v18, s84, 28 +; SI-NEXT: v_readfirstlane_b32 s20, v4 +; SI-NEXT: v_readfirstlane_b32 s21, v5 +; SI-NEXT: v_readfirstlane_b32 s16, v6 +; SI-NEXT: v_readfirstlane_b32 s17, v7 +; SI-NEXT: v_readfirstlane_b32 s14, v8 +; SI-NEXT: v_readfirstlane_b32 s15, v9 +; SI-NEXT: v_readfirstlane_b32 s12, v10 +; SI-NEXT: v_readfirstlane_b32 s13, v11 +; SI-NEXT: v_readfirstlane_b32 s10, v12 +; SI-NEXT: v_readfirstlane_b32 s11, v13 +; SI-NEXT: v_readfirstlane_b32 s8, v14 +; SI-NEXT: v_readfirstlane_b32 s9, v15 +; SI-NEXT: v_readfirstlane_b32 s6, v16 +; SI-NEXT: v_readfirstlane_b32 s7, v17 ; SI-NEXT: v_readfirstlane_b32 s4, v1 -; SI-NEXT: s_and_b64 s[6:7], vcc, exec +; SI-NEXT: s_and_b64 s[18:19], vcc, exec ; SI-NEXT: v_readfirstlane_b32 s5, v2 -; SI-NEXT: v_writelane_b32 v4, s85, 29 +; SI-NEXT: v_writelane_b32 v18, s85, 29 ; SI-NEXT: s_cbranch_scc0 .LBB25_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s38, s5, 24 ; SI-NEXT: s_lshr_b32 s39, s5, 16 ; SI-NEXT: s_lshr_b32 s48, s5, 8 -; SI-NEXT: s_lshr_b32 s49, s29, 24 -; SI-NEXT: s_lshr_b32 s50, s29, 16 -; SI-NEXT: s_lshr_b32 s51, s29, 8 -; SI-NEXT: s_lshr_b32 s52, s27, 24 -; SI-NEXT: s_lshr_b32 s53, s27, 16 -; SI-NEXT: s_lshr_b32 s54, s27, 8 -; SI-NEXT: s_lshr_b32 s55, s25, 24 -; SI-NEXT: s_lshr_b32 s64, s25, 16 -; SI-NEXT: s_lshr_b32 s65, s25, 8 -; SI-NEXT: s_lshr_b32 s66, s23, 24 -; SI-NEXT: s_lshr_b32 s67, s23, 16 -; SI-NEXT: s_lshr_b32 s68, s23, 8 -; SI-NEXT: s_lshr_b32 s69, s21, 24 -; SI-NEXT: s_lshr_b32 s70, s21, 16 -; SI-NEXT: s_lshr_b32 s71, s21, 8 -; SI-NEXT: s_lshr_b32 s80, s19, 24 -; SI-NEXT: s_lshr_b32 s81, s19, 16 -; SI-NEXT: s_lshr_b32 s82, s19, 8 -; SI-NEXT: s_lshr_b32 s83, s17, 24 -; SI-NEXT: s_lshr_b32 s84, s17, 16 -; SI-NEXT: s_lshr_b32 s85, s17, 8 -; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 8 -; SI-NEXT: s_lshr_b64 s[12:13], s[28:29], 24 -; SI-NEXT: s_lshr_b64 s[14:15], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[40:41], s[28:29], 8 -; SI-NEXT: s_lshr_b64 s[42:43], s[26:27], 24 -; SI-NEXT: s_lshr_b64 s[44:45], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 8 -; SI-NEXT: s_lshr_b64 s[56:57], s[24:25], 24 -; SI-NEXT: s_lshr_b64 s[58:59], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[60:61], s[24:25], 8 -; SI-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 -; SI-NEXT: s_lshr_b64 s[72:73], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[88:89], s[22:23], 8 -; SI-NEXT: s_lshr_b64 s[74:75], s[20:21], 24 -; SI-NEXT: s_lshr_b64 s[76:77], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 8 -; SI-NEXT: s_lshr_b64 s[90:91], s[18:19], 24 -; SI-NEXT: s_lshr_b64 s[92:93], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[94:95], s[18:19], 8 -; SI-NEXT: s_lshr_b64 s[30:31], s[16:17], 24 -; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 16 -; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 8 +; SI-NEXT: s_lshr_b32 s49, s7, 24 +; SI-NEXT: s_lshr_b32 s50, s7, 16 +; SI-NEXT: s_lshr_b32 s51, s7, 8 +; SI-NEXT: s_lshr_b32 s52, s9, 24 +; SI-NEXT: s_lshr_b32 s53, s9, 16 +; SI-NEXT: s_lshr_b32 s54, s9, 8 +; SI-NEXT: s_lshr_b32 s55, s11, 24 +; SI-NEXT: s_lshr_b32 s64, s11, 16 +; SI-NEXT: s_lshr_b32 s65, s11, 8 +; SI-NEXT: s_lshr_b32 s66, s13, 24 +; SI-NEXT: s_lshr_b32 s67, s13, 16 +; SI-NEXT: s_lshr_b32 s68, s13, 8 +; SI-NEXT: s_lshr_b32 s69, s15, 24 +; SI-NEXT: s_lshr_b32 s70, s15, 16 +; SI-NEXT: s_lshr_b32 s71, s15, 8 +; SI-NEXT: s_lshr_b32 s80, s17, 24 +; SI-NEXT: s_lshr_b32 s81, s17, 16 +; SI-NEXT: s_lshr_b32 s82, s17, 8 +; SI-NEXT: s_lshr_b32 s83, s21, 24 +; SI-NEXT: s_lshr_b32 s84, s21, 16 +; SI-NEXT: s_lshr_b32 s85, s21, 8 +; SI-NEXT: s_lshr_b64 s[18:19], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[22:23], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[42:43], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[44:45], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[56:57], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 8 +; SI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[72:73], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[74:75], s[14:15], 24 +; SI-NEXT: s_lshr_b64 s[76:77], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[14:15], 8 +; SI-NEXT: s_lshr_b64 s[90:91], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[92:93], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[16:17], 8 +; SI-NEXT: s_lshr_b64 s[30:31], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[34:35], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[36:37], s[20:21], 8 ; SI-NEXT: s_cbranch_execnz .LBB25_3 ; SI-NEXT: .LBB25_2: ; %cmp.true -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s5, s5, 3 ; SI-NEXT: s_add_i32 s4, s4, 3 -; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 8 -; SI-NEXT: s_lshr_b64 s[12:13], s[28:29], 24 -; SI-NEXT: s_lshr_b64 s[14:15], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[40:41], s[28:29], 8 -; SI-NEXT: s_lshr_b64 s[42:43], s[26:27], 24 -; SI-NEXT: s_lshr_b64 s[44:45], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 8 -; SI-NEXT: s_lshr_b64 s[56:57], s[24:25], 24 -; SI-NEXT: s_lshr_b64 s[58:59], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[60:61], s[24:25], 8 -; SI-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 -; SI-NEXT: s_lshr_b64 s[72:73], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[18:19], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[22:23], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[42:43], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[44:45], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[56:57], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 8 +; SI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[72:73], s[12:13], 16 ; SI-NEXT: s_lshr_b32 s38, s5, 24 ; SI-NEXT: s_lshr_b32 s39, s5, 16 ; SI-NEXT: s_lshr_b32 s48, s5, 8 -; SI-NEXT: s_lshr_b32 s49, s29, 24 -; SI-NEXT: s_lshr_b32 s50, s29, 16 -; SI-NEXT: s_lshr_b32 s51, s29, 8 -; SI-NEXT: s_lshr_b32 s52, s27, 24 -; SI-NEXT: s_lshr_b32 s53, s27, 16 -; SI-NEXT: s_lshr_b32 s54, s27, 8 -; SI-NEXT: s_lshr_b32 s55, s25, 24 -; SI-NEXT: s_lshr_b32 s64, s25, 16 -; SI-NEXT: s_lshr_b32 s65, s25, 8 -; SI-NEXT: s_lshr_b32 s66, s23, 24 -; SI-NEXT: s_lshr_b32 s67, s23, 16 -; SI-NEXT: s_lshr_b32 s68, s23, 8 -; SI-NEXT: s_lshr_b32 s69, s21, 24 -; SI-NEXT: s_lshr_b32 s70, s21, 16 -; SI-NEXT: s_lshr_b32 s71, s21, 8 -; SI-NEXT: s_lshr_b32 s80, s19, 24 -; SI-NEXT: s_lshr_b32 s81, s19, 16 -; SI-NEXT: s_lshr_b32 s82, s19, 8 -; SI-NEXT: s_lshr_b32 s83, s17, 24 -; SI-NEXT: s_lshr_b32 s84, s17, 16 -; SI-NEXT: s_lshr_b32 s85, s17, 8 -; SI-NEXT: s_lshr_b64 s[88:89], s[22:23], 8 -; SI-NEXT: s_lshr_b64 s[74:75], s[20:21], 24 -; SI-NEXT: s_lshr_b64 s[76:77], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 8 -; SI-NEXT: s_lshr_b64 s[90:91], s[18:19], 24 -; SI-NEXT: s_lshr_b64 s[92:93], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[94:95], s[18:19], 8 -; SI-NEXT: s_lshr_b64 s[30:31], s[16:17], 24 -; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 16 -; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 8 +; SI-NEXT: s_lshr_b32 s49, s7, 24 +; SI-NEXT: s_lshr_b32 s50, s7, 16 +; SI-NEXT: s_lshr_b32 s51, s7, 8 +; SI-NEXT: s_lshr_b32 s52, s9, 24 +; SI-NEXT: s_lshr_b32 s53, s9, 16 +; SI-NEXT: s_lshr_b32 s54, s9, 8 +; SI-NEXT: s_lshr_b32 s55, s11, 24 +; SI-NEXT: s_lshr_b32 s64, s11, 16 +; SI-NEXT: s_lshr_b32 s65, s11, 8 +; SI-NEXT: s_lshr_b32 s66, s13, 24 +; SI-NEXT: s_lshr_b32 s67, s13, 16 +; SI-NEXT: s_lshr_b32 s68, s13, 8 +; SI-NEXT: s_lshr_b32 s69, s15, 24 +; SI-NEXT: s_lshr_b32 s70, s15, 16 +; SI-NEXT: s_lshr_b32 s71, s15, 8 +; SI-NEXT: s_lshr_b32 s80, s17, 24 +; SI-NEXT: s_lshr_b32 s81, s17, 16 +; SI-NEXT: s_lshr_b32 s82, s17, 8 +; SI-NEXT: s_lshr_b32 s83, s21, 24 +; SI-NEXT: s_lshr_b32 s84, s21, 16 +; SI-NEXT: s_lshr_b32 s85, s21, 8 +; SI-NEXT: s_lshr_b64 s[88:89], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[74:75], s[14:15], 24 +; SI-NEXT: s_lshr_b64 s[76:77], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[14:15], 8 +; SI-NEXT: s_lshr_b64 s[90:91], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[92:93], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[16:17], 8 +; SI-NEXT: s_lshr_b64 s[30:31], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[34:35], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[36:37], s[20:21], 8 ; SI-NEXT: .LBB25_3: ; %end -; SI-NEXT: s_lshl_b32 s7, s36, 8 -; SI-NEXT: s_and_b32 s9, s16, 0xff -; SI-NEXT: s_or_b32 s7, s9, s7 -; SI-NEXT: s_and_b32 s9, s34, 0xff -; SI-NEXT: s_lshl_b32 s11, s30, 24 -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: s_and_b32 s7, s17, 0xff -; SI-NEXT: s_lshl_b32 s9, s85, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s84, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s83, 24 -; SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_lshl_b32 s7, s94, 8 -; SI-NEXT: s_and_b32 s9, s18, 0xff -; SI-NEXT: s_or_b32 s7, s9, s7 -; SI-NEXT: s_and_b32 s9, s92, 0xff -; SI-NEXT: s_lshl_b32 s11, s90, 24 -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: s_and_b32 s7, s19, 0xff -; SI-NEXT: s_lshl_b32 s9, s82, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s81, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s80, 24 -; SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s19, s36, 8 +; SI-NEXT: s_and_b32 s20, s20, 0xff +; SI-NEXT: s_or_b32 s19, s20, s19 +; SI-NEXT: s_and_b32 s20, s34, 0xff +; SI-NEXT: s_lshl_b32 s23, s30, 24 +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_or_b32 s20, s23, s20 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_or_b32 s19, s19, s20 +; SI-NEXT: v_mov_b32_e32 v1, s19 +; SI-NEXT: s_and_b32 s19, s21, 0xff +; SI-NEXT: s_lshl_b32 s20, s85, 8 +; SI-NEXT: s_or_b32 s19, s19, s20 +; SI-NEXT: s_and_b32 s20, s84, 0xff +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_lshl_b32 s21, s83, 24 +; SI-NEXT: s_or_b32 s20, s21, s20 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_or_b32 s19, s19, s20 +; SI-NEXT: v_mov_b32_e32 v2, s19 +; SI-NEXT: s_lshl_b32 s19, s94, 8 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_or_b32 s16, s16, s19 +; SI-NEXT: s_and_b32 s19, s92, 0xff +; SI-NEXT: s_lshl_b32 s20, s90, 24 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_or_b32 s19, s20, s19 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s19 +; SI-NEXT: v_mov_b32_e32 v3, s16 +; SI-NEXT: s_and_b32 s16, s17, 0xff +; SI-NEXT: s_lshl_b32 s17, s82, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s81, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s19, s80, 24 +; SI-NEXT: s_or_b32 s17, s19, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s20, 0xff -; SI-NEXT: s_lshl_b32 s9, s78, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s76, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s74, 24 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s14, s14, 0xff +; SI-NEXT: s_lshl_b32 s16, s78, 8 +; SI-NEXT: s_or_b32 s14, s14, s16 +; SI-NEXT: s_and_b32 s16, s76, 0xff +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_lshl_b32 s17, s74, 24 ; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s14, s14, s16 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s21, 0xff -; SI-NEXT: s_lshl_b32 s9, s71, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s70, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s69, 24 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_and_b32 s14, s15, 0xff +; SI-NEXT: s_lshl_b32 s15, s71, 8 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_and_b32 s15, s70, 0xff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_lshl_b32 s16, s69, 24 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_or_b32 s15, s16, s15 ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xff -; SI-NEXT: s_lshl_b32 s9, s88, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s72, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s62, 24 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_and_b32 s12, s12, 0xff +; SI-NEXT: s_lshl_b32 s14, s88, 8 +; SI-NEXT: s_or_b32 s12, s12, s14 +; SI-NEXT: s_and_b32 s14, s72, 0xff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_lshl_b32 s15, s62, 24 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_or_b32 s14, s15, s14 ; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s12, s12, s14 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s23, 0xff -; SI-NEXT: s_lshl_b32 s9, s68, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s67, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s66, 24 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s12, s13, 0xff +; SI-NEXT: s_lshl_b32 s13, s68, 8 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s67, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_lshl_b32 s14, s66, 24 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_or_b32 s13, s14, s13 ; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s12, s12, s13 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s24, 0xff -; SI-NEXT: s_lshl_b32 s9, s60, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s58, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s56, 24 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s10, s10, 0xff +; SI-NEXT: s_lshl_b32 s12, s60, 8 +; SI-NEXT: s_or_b32 s10, s10, s12 +; SI-NEXT: s_and_b32 s12, s58, 0xff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_lshl_b32 s13, s56, 24 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s12, s13, s12 ; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s10, s10, s12 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s25, 0xff -; SI-NEXT: s_lshl_b32 s9, s65, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s64, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s55, 24 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s10, s11, 0xff +; SI-NEXT: s_lshl_b32 s11, s65, 8 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s64, 0xff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s12, s55, 24 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s11, s12, s11 ; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s26, 0xff -; SI-NEXT: s_lshl_b32 s9, s46, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s44, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: s_lshl_b32 s10, s46, 8 +; SI-NEXT: s_or_b32 s8, s8, s10 +; SI-NEXT: s_and_b32 s10, s44, 0xff +; SI-NEXT: s_lshl_b32 s10, s10, 16 ; SI-NEXT: s_lshl_b32 s11, s42, 24 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s8, s8, s10 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s27, 0xff +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s8, s9, 0xff ; SI-NEXT: s_lshl_b32 s9, s54, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s53, 0xff ; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s52, 24 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: s_lshl_b32 s10, s52, 24 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s28, 0xff -; SI-NEXT: s_lshl_b32 s9, s40, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s14, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s12, 24 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_lshl_b32 s8, s40, 8 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: s_and_b32 s8, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s9, s26, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s6, s6, s8 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s29, 0xff -; SI-NEXT: s_lshl_b32 s9, s51, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s50, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s49, 24 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s6, s7, 0xff +; SI-NEXT: s_lshl_b32 s7, s51, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s50, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s49, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s7, s10, 8 -; SI-NEXT: s_or_b32 s4, s4, s7 -; SI-NEXT: s_and_b32 s7, s8, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: s_lshl_b32 s6, s6, 24 +; SI-NEXT: s_lshl_b32 s6, s24, 8 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s18, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -10109,38 +10189,38 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: v_readlane_b32 s85, v4, 29 -; SI-NEXT: v_readlane_b32 s84, v4, 28 -; SI-NEXT: v_readlane_b32 s83, v4, 27 -; SI-NEXT: v_readlane_b32 s82, v4, 26 -; SI-NEXT: v_readlane_b32 s81, v4, 25 -; SI-NEXT: v_readlane_b32 s80, v4, 24 -; SI-NEXT: v_readlane_b32 s71, v4, 23 -; SI-NEXT: v_readlane_b32 s70, v4, 22 -; SI-NEXT: v_readlane_b32 s69, v4, 21 -; SI-NEXT: v_readlane_b32 s68, v4, 20 -; SI-NEXT: v_readlane_b32 s67, v4, 19 -; SI-NEXT: v_readlane_b32 s66, v4, 18 -; SI-NEXT: v_readlane_b32 s65, v4, 17 -; SI-NEXT: v_readlane_b32 s64, v4, 16 -; SI-NEXT: v_readlane_b32 s55, v4, 15 -; SI-NEXT: v_readlane_b32 s54, v4, 14 -; SI-NEXT: v_readlane_b32 s53, v4, 13 -; SI-NEXT: v_readlane_b32 s52, v4, 12 -; SI-NEXT: v_readlane_b32 s51, v4, 11 -; SI-NEXT: v_readlane_b32 s50, v4, 10 -; SI-NEXT: v_readlane_b32 s49, v4, 9 -; SI-NEXT: v_readlane_b32 s48, v4, 8 -; SI-NEXT: v_readlane_b32 s39, v4, 7 -; SI-NEXT: v_readlane_b32 s38, v4, 6 -; SI-NEXT: v_readlane_b32 s37, v4, 5 -; SI-NEXT: v_readlane_b32 s36, v4, 4 -; SI-NEXT: v_readlane_b32 s35, v4, 3 -; SI-NEXT: v_readlane_b32 s34, v4, 2 -; SI-NEXT: v_readlane_b32 s31, v4, 1 -; SI-NEXT: v_readlane_b32 s30, v4, 0 +; SI-NEXT: v_readlane_b32 s85, v18, 29 +; SI-NEXT: v_readlane_b32 s84, v18, 28 +; SI-NEXT: v_readlane_b32 s83, v18, 27 +; SI-NEXT: v_readlane_b32 s82, v18, 26 +; SI-NEXT: v_readlane_b32 s81, v18, 25 +; SI-NEXT: v_readlane_b32 s80, v18, 24 +; SI-NEXT: v_readlane_b32 s71, v18, 23 +; SI-NEXT: v_readlane_b32 s70, v18, 22 +; SI-NEXT: v_readlane_b32 s69, v18, 21 +; SI-NEXT: v_readlane_b32 s68, v18, 20 +; SI-NEXT: v_readlane_b32 s67, v18, 19 +; SI-NEXT: v_readlane_b32 s66, v18, 18 +; SI-NEXT: v_readlane_b32 s65, v18, 17 +; SI-NEXT: v_readlane_b32 s64, v18, 16 +; SI-NEXT: v_readlane_b32 s55, v18, 15 +; SI-NEXT: v_readlane_b32 s54, v18, 14 +; SI-NEXT: v_readlane_b32 s53, v18, 13 +; SI-NEXT: v_readlane_b32 s52, v18, 12 +; SI-NEXT: v_readlane_b32 s51, v18, 11 +; SI-NEXT: v_readlane_b32 s50, v18, 10 +; SI-NEXT: v_readlane_b32 s49, v18, 9 +; SI-NEXT: v_readlane_b32 s48, v18, 8 +; SI-NEXT: v_readlane_b32 s39, v18, 7 +; SI-NEXT: v_readlane_b32 s38, v18, 6 +; SI-NEXT: v_readlane_b32 s37, v18, 5 +; SI-NEXT: v_readlane_b32 s36, v18, 4 +; SI-NEXT: v_readlane_b32 s35, v18, 3 +; SI-NEXT: v_readlane_b32 s34, v18, 2 +; SI-NEXT: v_readlane_b32 s31, v18, 1 +; SI-NEXT: v_readlane_b32 s30, v18, 0 ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -10188,43 +10268,71 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 ; SI-NEXT: ; implicit-def: $sgpr44 ; SI-NEXT: ; implicit-def: $sgpr42 ; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr18 ; SI-NEXT: s_branch .LBB25_2 ; ; VI-LABEL: bitcast_v16i32_to_v64i8_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v4, s30, 0 -; VI-NEXT: v_writelane_b32 v4, s31, 1 -; VI-NEXT: v_writelane_b32 v4, s34, 2 -; VI-NEXT: v_writelane_b32 v4, s35, 3 -; VI-NEXT: v_writelane_b32 v4, s36, 4 -; VI-NEXT: v_writelane_b32 v4, s37, 5 -; VI-NEXT: v_writelane_b32 v4, s38, 6 -; VI-NEXT: v_writelane_b32 v4, s39, 7 -; VI-NEXT: v_writelane_b32 v4, s48, 8 -; VI-NEXT: v_writelane_b32 v4, s49, 9 -; VI-NEXT: v_writelane_b32 v4, s50, 10 -; VI-NEXT: v_writelane_b32 v4, s51, 11 -; VI-NEXT: v_writelane_b32 v4, s52, 12 -; VI-NEXT: v_writelane_b32 v4, s53, 13 -; VI-NEXT: v_writelane_b32 v4, s54, 14 -; VI-NEXT: v_writelane_b32 v4, s55, 15 -; VI-NEXT: v_writelane_b32 v4, s64, 16 -; VI-NEXT: v_writelane_b32 v4, s65, 17 +; VI-NEXT: v_writelane_b32 v18, s30, 0 +; VI-NEXT: v_writelane_b32 v18, s31, 1 +; VI-NEXT: v_writelane_b32 v18, s34, 2 +; VI-NEXT: v_writelane_b32 v18, s35, 3 +; VI-NEXT: v_writelane_b32 v18, s36, 4 +; VI-NEXT: v_writelane_b32 v18, s37, 5 +; VI-NEXT: v_writelane_b32 v18, s38, 6 +; VI-NEXT: v_writelane_b32 v18, s39, 7 +; VI-NEXT: v_writelane_b32 v18, s48, 8 +; VI-NEXT: v_writelane_b32 v18, s49, 9 +; VI-NEXT: v_writelane_b32 v18, s50, 10 +; VI-NEXT: v_writelane_b32 v18, s51, 11 +; VI-NEXT: v_writelane_b32 v18, s52, 12 +; VI-NEXT: v_writelane_b32 v18, s53, 13 +; VI-NEXT: v_writelane_b32 v18, s54, 14 +; VI-NEXT: v_writelane_b32 v18, s55, 15 +; VI-NEXT: v_writelane_b32 v18, s64, 16 +; VI-NEXT: v_writelane_b32 v18, s65, 17 +; VI-NEXT: v_mov_b32_e32 v4, s16 +; VI-NEXT: v_mov_b32_e32 v5, s17 +; VI-NEXT: v_mov_b32_e32 v6, s18 +; VI-NEXT: v_mov_b32_e32 v7, s19 +; VI-NEXT: v_mov_b32_e32 v8, s20 +; VI-NEXT: v_mov_b32_e32 v9, s21 +; VI-NEXT: v_mov_b32_e32 v10, s22 +; VI-NEXT: v_mov_b32_e32 v11, s23 +; VI-NEXT: v_mov_b32_e32 v12, s24 +; VI-NEXT: v_mov_b32_e32 v13, s25 +; VI-NEXT: v_mov_b32_e32 v14, s26 +; VI-NEXT: v_mov_b32_e32 v15, s27 +; VI-NEXT: v_mov_b32_e32 v16, s28 +; VI-NEXT: v_mov_b32_e32 v17, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; VI-NEXT: v_writelane_b32 v4, s66, 18 +; VI-NEXT: v_writelane_b32 v18, s66, 18 +; VI-NEXT: v_readfirstlane_b32 s18, v4 +; VI-NEXT: v_readfirstlane_b32 s19, v5 +; VI-NEXT: v_readfirstlane_b32 s16, v6 +; VI-NEXT: v_readfirstlane_b32 s17, v7 +; VI-NEXT: v_readfirstlane_b32 s14, v8 +; VI-NEXT: v_readfirstlane_b32 s15, v9 +; VI-NEXT: v_readfirstlane_b32 s12, v10 +; VI-NEXT: v_readfirstlane_b32 s13, v11 +; VI-NEXT: v_readfirstlane_b32 s10, v12 +; VI-NEXT: v_readfirstlane_b32 s11, v13 +; VI-NEXT: v_readfirstlane_b32 s8, v14 +; VI-NEXT: v_readfirstlane_b32 s9, v15 +; VI-NEXT: v_readfirstlane_b32 s6, v16 +; VI-NEXT: v_readfirstlane_b32 s7, v17 ; VI-NEXT: v_readfirstlane_b32 s4, v1 -; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_and_b64 s[20:21], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s5, v2 -; VI-NEXT: v_writelane_b32 v4, s67, 19 +; VI-NEXT: v_writelane_b32 v18, s67, 19 ; VI-NEXT: s_cbranch_scc0 .LBB25_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_lshr_b32 s56, s5, 24 @@ -10232,287 +10340,287 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 ; VI-NEXT: s_lshr_b32 s58, s5, 8 ; VI-NEXT: s_lshr_b32 s59, s4, 16 ; VI-NEXT: s_lshr_b32 s60, s4, 8 -; VI-NEXT: s_lshr_b32 s61, s29, 24 -; VI-NEXT: s_lshr_b32 s62, s29, 16 -; VI-NEXT: s_lshr_b32 s63, s29, 8 -; VI-NEXT: s_lshr_b32 s72, s28, 16 -; VI-NEXT: s_lshr_b32 s73, s28, 8 -; VI-NEXT: s_lshr_b32 s74, s27, 24 -; VI-NEXT: s_lshr_b32 s75, s27, 16 -; VI-NEXT: s_lshr_b32 s76, s27, 8 -; VI-NEXT: s_lshr_b32 s77, s26, 16 -; VI-NEXT: s_lshr_b32 s78, s26, 8 -; VI-NEXT: s_lshr_b32 s79, s25, 24 -; VI-NEXT: s_lshr_b32 s88, s25, 16 -; VI-NEXT: s_lshr_b32 s89, s25, 8 -; VI-NEXT: s_lshr_b32 s90, s24, 16 -; VI-NEXT: s_lshr_b32 s91, s24, 8 -; VI-NEXT: s_lshr_b32 s30, s23, 24 -; VI-NEXT: s_lshr_b32 s31, s23, 16 -; VI-NEXT: s_lshr_b32 s34, s23, 8 -; VI-NEXT: s_lshr_b32 s35, s22, 16 -; VI-NEXT: s_lshr_b32 s36, s22, 8 -; VI-NEXT: s_lshr_b32 s37, s21, 24 -; VI-NEXT: s_lshr_b32 s38, s21, 16 -; VI-NEXT: s_lshr_b32 s39, s21, 8 -; VI-NEXT: s_lshr_b32 s48, s20, 16 -; VI-NEXT: s_lshr_b32 s49, s20, 8 -; VI-NEXT: s_lshr_b32 s50, s19, 24 -; VI-NEXT: s_lshr_b32 s51, s19, 16 -; VI-NEXT: s_lshr_b32 s52, s19, 8 -; VI-NEXT: s_lshr_b32 s53, s18, 16 -; VI-NEXT: s_lshr_b32 s54, s18, 8 -; VI-NEXT: s_lshr_b32 s55, s17, 24 -; VI-NEXT: s_lshr_b32 s64, s17, 16 -; VI-NEXT: s_lshr_b32 s65, s17, 8 -; VI-NEXT: s_lshr_b32 s66, s16, 16 -; VI-NEXT: s_lshr_b32 s67, s16, 8 -; VI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 -; VI-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 -; VI-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 -; VI-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s61, s7, 24 +; VI-NEXT: s_lshr_b32 s62, s7, 16 +; VI-NEXT: s_lshr_b32 s63, s7, 8 +; VI-NEXT: s_lshr_b32 s72, s6, 16 +; VI-NEXT: s_lshr_b32 s73, s6, 8 +; VI-NEXT: s_lshr_b32 s74, s9, 24 +; VI-NEXT: s_lshr_b32 s75, s9, 16 +; VI-NEXT: s_lshr_b32 s76, s9, 8 +; VI-NEXT: s_lshr_b32 s77, s8, 16 +; VI-NEXT: s_lshr_b32 s78, s8, 8 +; VI-NEXT: s_lshr_b32 s79, s11, 24 +; VI-NEXT: s_lshr_b32 s88, s11, 16 +; VI-NEXT: s_lshr_b32 s89, s11, 8 +; VI-NEXT: s_lshr_b32 s90, s10, 16 +; VI-NEXT: s_lshr_b32 s91, s10, 8 +; VI-NEXT: s_lshr_b32 s30, s13, 24 +; VI-NEXT: s_lshr_b32 s31, s13, 16 +; VI-NEXT: s_lshr_b32 s34, s13, 8 +; VI-NEXT: s_lshr_b32 s35, s12, 16 +; VI-NEXT: s_lshr_b32 s36, s12, 8 +; VI-NEXT: s_lshr_b32 s37, s15, 24 +; VI-NEXT: s_lshr_b32 s38, s15, 16 +; VI-NEXT: s_lshr_b32 s39, s15, 8 +; VI-NEXT: s_lshr_b32 s48, s14, 16 +; VI-NEXT: s_lshr_b32 s49, s14, 8 +; VI-NEXT: s_lshr_b32 s50, s17, 24 +; VI-NEXT: s_lshr_b32 s51, s17, 16 +; VI-NEXT: s_lshr_b32 s52, s17, 8 +; VI-NEXT: s_lshr_b32 s53, s16, 16 +; VI-NEXT: s_lshr_b32 s54, s16, 8 +; VI-NEXT: s_lshr_b32 s55, s19, 24 +; VI-NEXT: s_lshr_b32 s64, s19, 16 +; VI-NEXT: s_lshr_b32 s65, s19, 8 +; VI-NEXT: s_lshr_b32 s66, s18, 16 +; VI-NEXT: s_lshr_b32 s67, s18, 8 +; VI-NEXT: s_lshr_b64 s[20:21], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[22:23], s[6:7], 24 +; VI-NEXT: s_lshr_b64 s[24:25], s[8:9], 24 +; VI-NEXT: s_lshr_b64 s[26:27], s[10:11], 24 +; VI-NEXT: s_lshr_b64 s[28:29], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[40:41], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[42:43], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[44:45], s[18:19], 24 ; VI-NEXT: s_cbranch_execnz .LBB25_3 ; VI-NEXT: .LBB25_2: ; %cmp.true -; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_add_i32 s16, s16, 3 ; VI-NEXT: s_add_i32 s19, s19, 3 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 ; VI-NEXT: s_add_i32 s5, s5, 3 ; VI-NEXT: s_add_i32 s4, s4, 3 -; VI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 -; VI-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 -; VI-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 -; VI-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[20:21], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[22:23], s[6:7], 24 +; VI-NEXT: s_lshr_b64 s[24:25], s[8:9], 24 +; VI-NEXT: s_lshr_b64 s[26:27], s[10:11], 24 +; VI-NEXT: s_lshr_b64 s[28:29], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[40:41], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[42:43], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[44:45], s[18:19], 24 ; VI-NEXT: s_lshr_b32 s56, s5, 24 ; VI-NEXT: s_lshr_b32 s57, s5, 16 ; VI-NEXT: s_lshr_b32 s58, s5, 8 ; VI-NEXT: s_lshr_b32 s59, s4, 16 ; VI-NEXT: s_lshr_b32 s60, s4, 8 -; VI-NEXT: s_lshr_b32 s61, s29, 24 -; VI-NEXT: s_lshr_b32 s62, s29, 16 -; VI-NEXT: s_lshr_b32 s63, s29, 8 -; VI-NEXT: s_lshr_b32 s72, s28, 16 -; VI-NEXT: s_lshr_b32 s73, s28, 8 -; VI-NEXT: s_lshr_b32 s74, s27, 24 -; VI-NEXT: s_lshr_b32 s75, s27, 16 -; VI-NEXT: s_lshr_b32 s76, s27, 8 -; VI-NEXT: s_lshr_b32 s77, s26, 16 -; VI-NEXT: s_lshr_b32 s78, s26, 8 -; VI-NEXT: s_lshr_b32 s79, s25, 24 -; VI-NEXT: s_lshr_b32 s88, s25, 16 -; VI-NEXT: s_lshr_b32 s89, s25, 8 -; VI-NEXT: s_lshr_b32 s90, s24, 16 -; VI-NEXT: s_lshr_b32 s91, s24, 8 -; VI-NEXT: s_lshr_b32 s30, s23, 24 -; VI-NEXT: s_lshr_b32 s31, s23, 16 -; VI-NEXT: s_lshr_b32 s34, s23, 8 -; VI-NEXT: s_lshr_b32 s35, s22, 16 -; VI-NEXT: s_lshr_b32 s36, s22, 8 -; VI-NEXT: s_lshr_b32 s37, s21, 24 -; VI-NEXT: s_lshr_b32 s38, s21, 16 -; VI-NEXT: s_lshr_b32 s39, s21, 8 -; VI-NEXT: s_lshr_b32 s48, s20, 16 -; VI-NEXT: s_lshr_b32 s49, s20, 8 -; VI-NEXT: s_lshr_b32 s50, s19, 24 -; VI-NEXT: s_lshr_b32 s51, s19, 16 -; VI-NEXT: s_lshr_b32 s52, s19, 8 -; VI-NEXT: s_lshr_b32 s53, s18, 16 -; VI-NEXT: s_lshr_b32 s54, s18, 8 -; VI-NEXT: s_lshr_b32 s55, s17, 24 -; VI-NEXT: s_lshr_b32 s64, s17, 16 -; VI-NEXT: s_lshr_b32 s65, s17, 8 -; VI-NEXT: s_lshr_b32 s66, s16, 16 -; VI-NEXT: s_lshr_b32 s67, s16, 8 +; VI-NEXT: s_lshr_b32 s61, s7, 24 +; VI-NEXT: s_lshr_b32 s62, s7, 16 +; VI-NEXT: s_lshr_b32 s63, s7, 8 +; VI-NEXT: s_lshr_b32 s72, s6, 16 +; VI-NEXT: s_lshr_b32 s73, s6, 8 +; VI-NEXT: s_lshr_b32 s74, s9, 24 +; VI-NEXT: s_lshr_b32 s75, s9, 16 +; VI-NEXT: s_lshr_b32 s76, s9, 8 +; VI-NEXT: s_lshr_b32 s77, s8, 16 +; VI-NEXT: s_lshr_b32 s78, s8, 8 +; VI-NEXT: s_lshr_b32 s79, s11, 24 +; VI-NEXT: s_lshr_b32 s88, s11, 16 +; VI-NEXT: s_lshr_b32 s89, s11, 8 +; VI-NEXT: s_lshr_b32 s90, s10, 16 +; VI-NEXT: s_lshr_b32 s91, s10, 8 +; VI-NEXT: s_lshr_b32 s30, s13, 24 +; VI-NEXT: s_lshr_b32 s31, s13, 16 +; VI-NEXT: s_lshr_b32 s34, s13, 8 +; VI-NEXT: s_lshr_b32 s35, s12, 16 +; VI-NEXT: s_lshr_b32 s36, s12, 8 +; VI-NEXT: s_lshr_b32 s37, s15, 24 +; VI-NEXT: s_lshr_b32 s38, s15, 16 +; VI-NEXT: s_lshr_b32 s39, s15, 8 +; VI-NEXT: s_lshr_b32 s48, s14, 16 +; VI-NEXT: s_lshr_b32 s49, s14, 8 +; VI-NEXT: s_lshr_b32 s50, s17, 24 +; VI-NEXT: s_lshr_b32 s51, s17, 16 +; VI-NEXT: s_lshr_b32 s52, s17, 8 +; VI-NEXT: s_lshr_b32 s53, s16, 16 +; VI-NEXT: s_lshr_b32 s54, s16, 8 +; VI-NEXT: s_lshr_b32 s55, s19, 24 +; VI-NEXT: s_lshr_b32 s64, s19, 16 +; VI-NEXT: s_lshr_b32 s65, s19, 8 +; VI-NEXT: s_lshr_b32 s66, s18, 16 +; VI-NEXT: s_lshr_b32 s67, s18, 8 ; VI-NEXT: .LBB25_3: ; %end -; VI-NEXT: s_and_b32 s7, s16, 0xff -; VI-NEXT: s_lshl_b32 s9, s67, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s66, 0xff -; VI-NEXT: s_lshl_b32 s11, s44, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: s_and_b32 s7, s17, 0xff -; VI-NEXT: s_lshl_b32 s9, s65, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s64, 0xff -; VI-NEXT: s_lshl_b32 s11, s55, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s18, 0xff -; VI-NEXT: s_lshl_b32 s9, s54, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s53, 0xff -; VI-NEXT: s_lshl_b32 s11, s42, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_and_b32 s18, s18, 0xff +; VI-NEXT: s_lshl_b32 s21, s67, 8 +; VI-NEXT: s_or_b32 s18, s18, s21 +; VI-NEXT: s_and_b32 s21, s66, 0xff +; VI-NEXT: s_lshl_b32 s23, s44, 8 +; VI-NEXT: s_or_b32 s21, s21, s23 +; VI-NEXT: s_and_b32 s18, s18, 0xffff +; VI-NEXT: s_lshl_b32 s21, s21, 16 +; VI-NEXT: s_or_b32 s18, s18, s21 +; VI-NEXT: v_mov_b32_e32 v1, s18 +; VI-NEXT: s_and_b32 s18, s19, 0xff +; VI-NEXT: s_lshl_b32 s19, s65, 8 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, s64, 0xff +; VI-NEXT: s_lshl_b32 s21, s55, 8 +; VI-NEXT: s_or_b32 s19, s19, s21 +; VI-NEXT: s_and_b32 s18, s18, 0xffff +; VI-NEXT: s_lshl_b32 s19, s19, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xff +; VI-NEXT: s_lshl_b32 s18, s54, 8 +; VI-NEXT: s_or_b32 s16, s16, s18 +; VI-NEXT: s_and_b32 s18, s53, 0xff +; VI-NEXT: s_lshl_b32 s19, s42, 8 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s18, s18, 16 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s16, s16, s18 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s19, 0xff -; VI-NEXT: s_lshl_b32 s9, s52, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s51, 0xff -; VI-NEXT: s_lshl_b32 s11, s50, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: s_and_b32 s16, s17, 0xff +; VI-NEXT: s_lshl_b32 s17, s52, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, s51, 0xff +; VI-NEXT: s_lshl_b32 s18, s50, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s16, s16, s17 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s20, 0xff -; VI-NEXT: s_lshl_b32 s9, s49, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s48, 0xff -; VI-NEXT: s_lshl_b32 s11, s40, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: s_and_b32 s14, s14, 0xff +; VI-NEXT: s_lshl_b32 s16, s49, 8 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: s_and_b32 s16, s48, 0xff +; VI-NEXT: s_lshl_b32 s17, s40, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s14, s14, 0xffff +; VI-NEXT: s_lshl_b32 s16, s16, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s14, s14, s16 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s21, 0xff -; VI-NEXT: s_lshl_b32 s9, s39, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s38, 0xff -; VI-NEXT: s_lshl_b32 s11, s37, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: s_and_b32 s14, s15, 0xff +; VI-NEXT: s_lshl_b32 s15, s39, 8 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_and_b32 s15, s38, 0xff +; VI-NEXT: s_lshl_b32 s16, s37, 8 +; VI-NEXT: s_or_b32 s15, s15, s16 +; VI-NEXT: s_and_b32 s14, s14, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s14, s14, s15 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s22, 0xff -; VI-NEXT: s_lshl_b32 s9, s36, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s35, 0xff -; VI-NEXT: s_lshl_b32 s11, s14, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: s_and_b32 s12, s12, 0xff +; VI-NEXT: s_lshl_b32 s14, s36, 8 +; VI-NEXT: s_or_b32 s12, s12, s14 +; VI-NEXT: s_and_b32 s14, s35, 0xff +; VI-NEXT: s_lshl_b32 s15, s28, 8 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_and_b32 s12, s12, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s12, s12, s14 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s23, 0xff -; VI-NEXT: s_lshl_b32 s9, s34, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s31, 0xff -; VI-NEXT: s_lshl_b32 s11, s30, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s12 +; VI-NEXT: s_and_b32 s12, s13, 0xff +; VI-NEXT: s_lshl_b32 s13, s34, 8 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_and_b32 s13, s31, 0xff +; VI-NEXT: s_lshl_b32 s14, s30, 8 +; VI-NEXT: s_or_b32 s13, s13, s14 +; VI-NEXT: s_and_b32 s12, s12, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s12, s12, s13 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s24, 0xff -; VI-NEXT: s_lshl_b32 s9, s91, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s90, 0xff -; VI-NEXT: s_lshl_b32 s11, s12, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s12 +; VI-NEXT: s_and_b32 s10, s10, 0xff +; VI-NEXT: s_lshl_b32 s12, s91, 8 +; VI-NEXT: s_or_b32 s10, s10, s12 +; VI-NEXT: s_and_b32 s12, s90, 0xff +; VI-NEXT: s_lshl_b32 s13, s26, 8 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_lshl_b32 s12, s12, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s10, s10, s12 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s25, 0xff -; VI-NEXT: s_lshl_b32 s9, s89, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s88, 0xff -; VI-NEXT: s_lshl_b32 s11, s79, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: s_and_b32 s10, s11, 0xff +; VI-NEXT: s_lshl_b32 s11, s89, 8 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s11, s88, 0xff +; VI-NEXT: s_lshl_b32 s12, s79, 8 +; VI-NEXT: s_or_b32 s11, s11, s12 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s10, s10, s11 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s26, 0xff -; VI-NEXT: s_lshl_b32 s9, s78, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s77, 0xff -; VI-NEXT: s_lshl_b32 s10, s10, 8 -; VI-NEXT: s_or_b32 s9, s9, s10 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: s_and_b32 s8, s8, 0xff +; VI-NEXT: s_lshl_b32 s10, s78, 8 +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: s_and_b32 s10, s77, 0xff +; VI-NEXT: s_lshl_b32 s11, s24, 8 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 36, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s8, s8, s10 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s27, 0xff +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: s_and_b32 s8, s9, 0xff ; VI-NEXT: s_lshl_b32 s9, s76, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s8, s8, s9 ; VI-NEXT: s_and_b32 s9, s75, 0xff ; VI-NEXT: s_lshl_b32 s10, s74, 8 ; VI-NEXT: s_or_b32 s9, s9, s10 -; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s8, s8, 0xffff ; VI-NEXT: s_lshl_b32 s9, s9, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 40, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s8, s8, s9 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s28, 0xff -; VI-NEXT: s_lshl_b32 s9, s73, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s72, 0xff -; VI-NEXT: s_lshl_b32 s8, s8, 8 -; VI-NEXT: s_or_b32 s8, s9, s8 -; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xff +; VI-NEXT: s_lshl_b32 s8, s73, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s8, s72, 0xff +; VI-NEXT: s_lshl_b32 s9, s22, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 44, v0 -; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_or_b32 s6, s6, s8 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s29, 0xff -; VI-NEXT: s_lshl_b32 s8, s63, 8 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_and_b32 s6, s7, 0xff +; VI-NEXT: s_lshl_b32 s7, s63, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s62, 0xff +; VI-NEXT: s_lshl_b32 s8, s61, 8 ; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s8, s62, 0xff -; VI-NEXT: s_lshl_b32 s9, s61, 8 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v0 -; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_and_b32 s4, s4, 0xff -; VI-NEXT: s_lshl_b32 s7, s60, 8 -; VI-NEXT: s_or_b32 s4, s4, s7 -; VI-NEXT: s_and_b32 s7, s59, 0xff -; VI-NEXT: s_lshl_b32 s6, s6, 8 -; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_lshl_b32 s6, s60, 8 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_and_b32 s6, s59, 0xff +; VI-NEXT: s_lshl_b32 s7, s20, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 52, v0 @@ -10533,28 +10641,28 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 ; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: v_readlane_b32 s67, v4, 19 -; VI-NEXT: v_readlane_b32 s66, v4, 18 -; VI-NEXT: v_readlane_b32 s65, v4, 17 -; VI-NEXT: v_readlane_b32 s64, v4, 16 -; VI-NEXT: v_readlane_b32 s55, v4, 15 -; VI-NEXT: v_readlane_b32 s54, v4, 14 -; VI-NEXT: v_readlane_b32 s53, v4, 13 -; VI-NEXT: v_readlane_b32 s52, v4, 12 -; VI-NEXT: v_readlane_b32 s51, v4, 11 -; VI-NEXT: v_readlane_b32 s50, v4, 10 -; VI-NEXT: v_readlane_b32 s49, v4, 9 -; VI-NEXT: v_readlane_b32 s48, v4, 8 -; VI-NEXT: v_readlane_b32 s39, v4, 7 -; VI-NEXT: v_readlane_b32 s38, v4, 6 -; VI-NEXT: v_readlane_b32 s37, v4, 5 -; VI-NEXT: v_readlane_b32 s36, v4, 4 -; VI-NEXT: v_readlane_b32 s35, v4, 3 -; VI-NEXT: v_readlane_b32 s34, v4, 2 -; VI-NEXT: v_readlane_b32 s31, v4, 1 -; VI-NEXT: v_readlane_b32 s30, v4, 0 +; VI-NEXT: v_readlane_b32 s67, v18, 19 +; VI-NEXT: v_readlane_b32 s66, v18, 18 +; VI-NEXT: v_readlane_b32 s65, v18, 17 +; VI-NEXT: v_readlane_b32 s64, v18, 16 +; VI-NEXT: v_readlane_b32 s55, v18, 15 +; VI-NEXT: v_readlane_b32 s54, v18, 14 +; VI-NEXT: v_readlane_b32 s53, v18, 13 +; VI-NEXT: v_readlane_b32 s52, v18, 12 +; VI-NEXT: v_readlane_b32 s51, v18, 11 +; VI-NEXT: v_readlane_b32 s50, v18, 10 +; VI-NEXT: v_readlane_b32 s49, v18, 9 +; VI-NEXT: v_readlane_b32 s48, v18, 8 +; VI-NEXT: v_readlane_b32 s39, v18, 7 +; VI-NEXT: v_readlane_b32 s38, v18, 6 +; VI-NEXT: v_readlane_b32 s37, v18, 5 +; VI-NEXT: v_readlane_b32 s36, v18, 4 +; VI-NEXT: v_readlane_b32 s35, v18, 3 +; VI-NEXT: v_readlane_b32 s34, v18, 2 +; VI-NEXT: v_readlane_b32 s31, v18, 1 +; VI-NEXT: v_readlane_b32 s30, v18, 0 ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -10579,31 +10687,31 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 ; VI-NEXT: ; implicit-def: $sgpr37 ; VI-NEXT: ; implicit-def: $sgpr36 ; VI-NEXT: ; implicit-def: $sgpr35 -; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr28 ; VI-NEXT: ; implicit-def: $sgpr34 ; VI-NEXT: ; implicit-def: $sgpr31 ; VI-NEXT: ; implicit-def: $sgpr30 ; VI-NEXT: ; implicit-def: $sgpr91 ; VI-NEXT: ; implicit-def: $sgpr90 -; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; implicit-def: $sgpr89 ; VI-NEXT: ; implicit-def: $sgpr88 ; VI-NEXT: ; implicit-def: $sgpr79 ; VI-NEXT: ; implicit-def: $sgpr78 ; VI-NEXT: ; implicit-def: $sgpr77 -; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr24 ; VI-NEXT: ; implicit-def: $sgpr76 ; VI-NEXT: ; implicit-def: $sgpr75 ; VI-NEXT: ; implicit-def: $sgpr74 ; VI-NEXT: ; implicit-def: $sgpr73 ; VI-NEXT: ; implicit-def: $sgpr72 -; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr22 ; VI-NEXT: ; implicit-def: $sgpr63 ; VI-NEXT: ; implicit-def: $sgpr62 ; VI-NEXT: ; implicit-def: $sgpr61 ; VI-NEXT: ; implicit-def: $sgpr60 ; VI-NEXT: ; implicit-def: $sgpr59 -; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr20 ; VI-NEXT: ; implicit-def: $sgpr58 ; VI-NEXT: ; implicit-def: $sgpr57 ; VI-NEXT: ; implicit-def: $sgpr56 @@ -10613,28 +10721,56 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v4, s30, 0 -; GFX9-NEXT: v_writelane_b32 v4, s31, 1 -; GFX9-NEXT: v_writelane_b32 v4, s34, 2 -; GFX9-NEXT: v_writelane_b32 v4, s35, 3 -; GFX9-NEXT: v_writelane_b32 v4, s36, 4 -; GFX9-NEXT: v_writelane_b32 v4, s37, 5 -; GFX9-NEXT: v_writelane_b32 v4, s38, 6 -; GFX9-NEXT: v_writelane_b32 v4, s39, 7 -; GFX9-NEXT: v_writelane_b32 v4, s48, 8 -; GFX9-NEXT: v_writelane_b32 v4, s49, 9 -; GFX9-NEXT: v_writelane_b32 v4, s50, 10 -; GFX9-NEXT: v_writelane_b32 v4, s51, 11 -; GFX9-NEXT: v_writelane_b32 v4, s52, 12 -; GFX9-NEXT: v_writelane_b32 v4, s53, 13 +; GFX9-NEXT: v_writelane_b32 v18, s30, 0 +; GFX9-NEXT: v_writelane_b32 v18, s31, 1 +; GFX9-NEXT: v_writelane_b32 v18, s34, 2 +; GFX9-NEXT: v_writelane_b32 v18, s35, 3 +; GFX9-NEXT: v_writelane_b32 v18, s36, 4 +; GFX9-NEXT: v_writelane_b32 v18, s37, 5 +; GFX9-NEXT: v_writelane_b32 v18, s38, 6 +; GFX9-NEXT: v_writelane_b32 v18, s39, 7 +; GFX9-NEXT: v_writelane_b32 v18, s48, 8 +; GFX9-NEXT: v_writelane_b32 v18, s49, 9 +; GFX9-NEXT: v_writelane_b32 v18, s50, 10 +; GFX9-NEXT: v_writelane_b32 v18, s51, 11 +; GFX9-NEXT: v_writelane_b32 v18, s52, 12 +; GFX9-NEXT: v_writelane_b32 v18, s53, 13 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: v_mov_b32_e32 v5, s17 +; GFX9-NEXT: v_mov_b32_e32 v6, s18 +; GFX9-NEXT: v_mov_b32_e32 v7, s19 +; GFX9-NEXT: v_mov_b32_e32 v8, s20 +; GFX9-NEXT: v_mov_b32_e32 v9, s21 +; GFX9-NEXT: v_mov_b32_e32 v10, s22 +; GFX9-NEXT: v_mov_b32_e32 v11, s23 +; GFX9-NEXT: v_mov_b32_e32 v12, s24 +; GFX9-NEXT: v_mov_b32_e32 v13, s25 +; GFX9-NEXT: v_mov_b32_e32 v14, s26 +; GFX9-NEXT: v_mov_b32_e32 v15, s27 +; GFX9-NEXT: v_mov_b32_e32 v16, s28 +; GFX9-NEXT: v_mov_b32_e32 v17, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX9-NEXT: v_writelane_b32 v4, s54, 14 +; GFX9-NEXT: v_writelane_b32 v18, s54, 14 +; GFX9-NEXT: v_readfirstlane_b32 s18, v4 +; GFX9-NEXT: v_readfirstlane_b32 s19, v5 +; GFX9-NEXT: v_readfirstlane_b32 s16, v6 +; GFX9-NEXT: v_readfirstlane_b32 s17, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s12, v10 +; GFX9-NEXT: v_readfirstlane_b32 s13, v11 +; GFX9-NEXT: v_readfirstlane_b32 s10, v12 +; GFX9-NEXT: v_readfirstlane_b32 s11, v13 +; GFX9-NEXT: v_readfirstlane_b32 s8, v14 +; GFX9-NEXT: v_readfirstlane_b32 s9, v15 +; GFX9-NEXT: v_readfirstlane_b32 s6, v16 +; GFX9-NEXT: v_readfirstlane_b32 s7, v17 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec ; GFX9-NEXT: v_readfirstlane_b32 s5, v2 -; GFX9-NEXT: v_writelane_b32 v4, s55, 15 +; GFX9-NEXT: v_writelane_b32 v18, s55, 15 ; GFX9-NEXT: s_cbranch_scc0 .LBB25_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_lshr_b32 s56, s5, 24 @@ -10642,275 +10778,275 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 ; GFX9-NEXT: s_lshr_b32 s58, s5, 8 ; GFX9-NEXT: s_lshr_b32 s59, s4, 16 ; GFX9-NEXT: s_lshr_b32 s60, s4, 8 -; GFX9-NEXT: s_lshr_b32 s61, s29, 24 -; GFX9-NEXT: s_lshr_b32 s62, s29, 16 -; GFX9-NEXT: s_lshr_b32 s63, s29, 8 -; GFX9-NEXT: s_lshr_b32 s72, s28, 16 -; GFX9-NEXT: s_lshr_b32 s73, s28, 8 -; GFX9-NEXT: s_lshr_b32 s74, s27, 24 -; GFX9-NEXT: s_lshr_b32 s75, s27, 16 -; GFX9-NEXT: s_lshr_b32 s76, s27, 8 -; GFX9-NEXT: s_lshr_b32 s77, s26, 16 -; GFX9-NEXT: s_lshr_b32 s78, s26, 8 -; GFX9-NEXT: s_lshr_b32 s79, s25, 24 -; GFX9-NEXT: s_lshr_b32 s88, s25, 16 -; GFX9-NEXT: s_lshr_b32 s89, s25, 8 -; GFX9-NEXT: s_lshr_b32 s90, s24, 16 -; GFX9-NEXT: s_lshr_b32 s91, s24, 8 -; GFX9-NEXT: s_lshr_b32 s92, s23, 24 -; GFX9-NEXT: s_lshr_b32 s93, s23, 16 -; GFX9-NEXT: s_lshr_b32 s94, s23, 8 -; GFX9-NEXT: s_lshr_b32 s95, s22, 16 -; GFX9-NEXT: s_lshr_b32 s30, s22, 8 -; GFX9-NEXT: s_lshr_b32 s31, s21, 24 -; GFX9-NEXT: s_lshr_b32 s34, s21, 16 -; GFX9-NEXT: s_lshr_b32 s35, s21, 8 -; GFX9-NEXT: s_lshr_b32 s36, s20, 16 -; GFX9-NEXT: s_lshr_b32 s37, s20, 8 -; GFX9-NEXT: s_lshr_b32 s38, s19, 24 -; GFX9-NEXT: s_lshr_b32 s39, s19, 16 -; GFX9-NEXT: s_lshr_b32 s48, s19, 8 -; GFX9-NEXT: s_lshr_b32 s49, s18, 16 -; GFX9-NEXT: s_lshr_b32 s50, s18, 8 -; GFX9-NEXT: s_lshr_b32 s51, s17, 24 -; GFX9-NEXT: s_lshr_b32 s52, s17, 16 -; GFX9-NEXT: s_lshr_b32 s53, s17, 8 -; GFX9-NEXT: s_lshr_b32 s54, s16, 16 -; GFX9-NEXT: s_lshr_b32 s55, s16, 8 -; GFX9-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 -; GFX9-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 -; GFX9-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 -; GFX9-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 -; GFX9-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 -; GFX9-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 -; GFX9-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; GFX9-NEXT: s_lshr_b32 s61, s7, 24 +; GFX9-NEXT: s_lshr_b32 s62, s7, 16 +; GFX9-NEXT: s_lshr_b32 s63, s7, 8 +; GFX9-NEXT: s_lshr_b32 s72, s6, 16 +; GFX9-NEXT: s_lshr_b32 s73, s6, 8 +; GFX9-NEXT: s_lshr_b32 s74, s9, 24 +; GFX9-NEXT: s_lshr_b32 s75, s9, 16 +; GFX9-NEXT: s_lshr_b32 s76, s9, 8 +; GFX9-NEXT: s_lshr_b32 s77, s8, 16 +; GFX9-NEXT: s_lshr_b32 s78, s8, 8 +; GFX9-NEXT: s_lshr_b32 s79, s11, 24 +; GFX9-NEXT: s_lshr_b32 s88, s11, 16 +; GFX9-NEXT: s_lshr_b32 s89, s11, 8 +; GFX9-NEXT: s_lshr_b32 s90, s10, 16 +; GFX9-NEXT: s_lshr_b32 s91, s10, 8 +; GFX9-NEXT: s_lshr_b32 s92, s13, 24 +; GFX9-NEXT: s_lshr_b32 s93, s13, 16 +; GFX9-NEXT: s_lshr_b32 s94, s13, 8 +; GFX9-NEXT: s_lshr_b32 s95, s12, 16 +; GFX9-NEXT: s_lshr_b32 s30, s12, 8 +; GFX9-NEXT: s_lshr_b32 s31, s15, 24 +; GFX9-NEXT: s_lshr_b32 s34, s15, 16 +; GFX9-NEXT: s_lshr_b32 s35, s15, 8 +; GFX9-NEXT: s_lshr_b32 s36, s14, 16 +; GFX9-NEXT: s_lshr_b32 s37, s14, 8 +; GFX9-NEXT: s_lshr_b32 s38, s17, 24 +; GFX9-NEXT: s_lshr_b32 s39, s17, 16 +; GFX9-NEXT: s_lshr_b32 s48, s17, 8 +; GFX9-NEXT: s_lshr_b32 s49, s16, 16 +; GFX9-NEXT: s_lshr_b32 s50, s16, 8 +; GFX9-NEXT: s_lshr_b32 s51, s19, 24 +; GFX9-NEXT: s_lshr_b32 s52, s19, 16 +; GFX9-NEXT: s_lshr_b32 s53, s19, 8 +; GFX9-NEXT: s_lshr_b32 s54, s18, 16 +; GFX9-NEXT: s_lshr_b32 s55, s18, 8 +; GFX9-NEXT: s_lshr_b64 s[20:21], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[22:23], s[6:7], 24 +; GFX9-NEXT: s_lshr_b64 s[24:25], s[8:9], 24 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[10:11], 24 +; GFX9-NEXT: s_lshr_b64 s[28:29], s[12:13], 24 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[14:15], 24 +; GFX9-NEXT: s_lshr_b64 s[42:43], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[44:45], s[18:19], 24 ; GFX9-NEXT: s_cbranch_execnz .LBB25_3 ; GFX9-NEXT: .LBB25_2: ; %cmp.true -; GFX9-NEXT: s_add_i32 s17, s17, 3 -; GFX9-NEXT: s_add_i32 s16, s16, 3 ; GFX9-NEXT: s_add_i32 s19, s19, 3 ; GFX9-NEXT: s_add_i32 s18, s18, 3 -; GFX9-NEXT: s_add_i32 s21, s21, 3 -; GFX9-NEXT: s_add_i32 s20, s20, 3 -; GFX9-NEXT: s_add_i32 s23, s23, 3 -; GFX9-NEXT: s_add_i32 s22, s22, 3 -; GFX9-NEXT: s_add_i32 s25, s25, 3 -; GFX9-NEXT: s_add_i32 s24, s24, 3 -; GFX9-NEXT: s_add_i32 s27, s27, 3 -; GFX9-NEXT: s_add_i32 s26, s26, 3 -; GFX9-NEXT: s_add_i32 s29, s29, 3 -; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s15, s15, 3 +; GFX9-NEXT: s_add_i32 s14, s14, 3 +; GFX9-NEXT: s_add_i32 s13, s13, 3 +; GFX9-NEXT: s_add_i32 s12, s12, 3 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_add_i32 s10, s10, 3 +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: s_add_i32 s6, s6, 3 ; GFX9-NEXT: s_add_i32 s5, s5, 3 ; GFX9-NEXT: s_add_i32 s4, s4, 3 -; GFX9-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 -; GFX9-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 -; GFX9-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 -; GFX9-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 -; GFX9-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 -; GFX9-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 -; GFX9-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[20:21], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[22:23], s[6:7], 24 +; GFX9-NEXT: s_lshr_b64 s[24:25], s[8:9], 24 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[10:11], 24 +; GFX9-NEXT: s_lshr_b64 s[28:29], s[12:13], 24 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[14:15], 24 +; GFX9-NEXT: s_lshr_b64 s[42:43], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[44:45], s[18:19], 24 ; GFX9-NEXT: s_lshr_b32 s56, s5, 24 ; GFX9-NEXT: s_lshr_b32 s57, s5, 16 ; GFX9-NEXT: s_lshr_b32 s58, s5, 8 ; GFX9-NEXT: s_lshr_b32 s59, s4, 16 ; GFX9-NEXT: s_lshr_b32 s60, s4, 8 -; GFX9-NEXT: s_lshr_b32 s61, s29, 24 -; GFX9-NEXT: s_lshr_b32 s62, s29, 16 -; GFX9-NEXT: s_lshr_b32 s63, s29, 8 -; GFX9-NEXT: s_lshr_b32 s72, s28, 16 -; GFX9-NEXT: s_lshr_b32 s73, s28, 8 -; GFX9-NEXT: s_lshr_b32 s74, s27, 24 -; GFX9-NEXT: s_lshr_b32 s75, s27, 16 -; GFX9-NEXT: s_lshr_b32 s76, s27, 8 -; GFX9-NEXT: s_lshr_b32 s77, s26, 16 -; GFX9-NEXT: s_lshr_b32 s78, s26, 8 -; GFX9-NEXT: s_lshr_b32 s79, s25, 24 -; GFX9-NEXT: s_lshr_b32 s88, s25, 16 -; GFX9-NEXT: s_lshr_b32 s89, s25, 8 -; GFX9-NEXT: s_lshr_b32 s90, s24, 16 -; GFX9-NEXT: s_lshr_b32 s91, s24, 8 -; GFX9-NEXT: s_lshr_b32 s92, s23, 24 -; GFX9-NEXT: s_lshr_b32 s93, s23, 16 -; GFX9-NEXT: s_lshr_b32 s94, s23, 8 -; GFX9-NEXT: s_lshr_b32 s95, s22, 16 -; GFX9-NEXT: s_lshr_b32 s30, s22, 8 -; GFX9-NEXT: s_lshr_b32 s31, s21, 24 -; GFX9-NEXT: s_lshr_b32 s34, s21, 16 -; GFX9-NEXT: s_lshr_b32 s35, s21, 8 -; GFX9-NEXT: s_lshr_b32 s36, s20, 16 -; GFX9-NEXT: s_lshr_b32 s37, s20, 8 -; GFX9-NEXT: s_lshr_b32 s38, s19, 24 -; GFX9-NEXT: s_lshr_b32 s39, s19, 16 -; GFX9-NEXT: s_lshr_b32 s48, s19, 8 -; GFX9-NEXT: s_lshr_b32 s49, s18, 16 -; GFX9-NEXT: s_lshr_b32 s50, s18, 8 -; GFX9-NEXT: s_lshr_b32 s51, s17, 24 -; GFX9-NEXT: s_lshr_b32 s52, s17, 16 -; GFX9-NEXT: s_lshr_b32 s53, s17, 8 -; GFX9-NEXT: s_lshr_b32 s54, s16, 16 -; GFX9-NEXT: s_lshr_b32 s55, s16, 8 +; GFX9-NEXT: s_lshr_b32 s61, s7, 24 +; GFX9-NEXT: s_lshr_b32 s62, s7, 16 +; GFX9-NEXT: s_lshr_b32 s63, s7, 8 +; GFX9-NEXT: s_lshr_b32 s72, s6, 16 +; GFX9-NEXT: s_lshr_b32 s73, s6, 8 +; GFX9-NEXT: s_lshr_b32 s74, s9, 24 +; GFX9-NEXT: s_lshr_b32 s75, s9, 16 +; GFX9-NEXT: s_lshr_b32 s76, s9, 8 +; GFX9-NEXT: s_lshr_b32 s77, s8, 16 +; GFX9-NEXT: s_lshr_b32 s78, s8, 8 +; GFX9-NEXT: s_lshr_b32 s79, s11, 24 +; GFX9-NEXT: s_lshr_b32 s88, s11, 16 +; GFX9-NEXT: s_lshr_b32 s89, s11, 8 +; GFX9-NEXT: s_lshr_b32 s90, s10, 16 +; GFX9-NEXT: s_lshr_b32 s91, s10, 8 +; GFX9-NEXT: s_lshr_b32 s92, s13, 24 +; GFX9-NEXT: s_lshr_b32 s93, s13, 16 +; GFX9-NEXT: s_lshr_b32 s94, s13, 8 +; GFX9-NEXT: s_lshr_b32 s95, s12, 16 +; GFX9-NEXT: s_lshr_b32 s30, s12, 8 +; GFX9-NEXT: s_lshr_b32 s31, s15, 24 +; GFX9-NEXT: s_lshr_b32 s34, s15, 16 +; GFX9-NEXT: s_lshr_b32 s35, s15, 8 +; GFX9-NEXT: s_lshr_b32 s36, s14, 16 +; GFX9-NEXT: s_lshr_b32 s37, s14, 8 +; GFX9-NEXT: s_lshr_b32 s38, s17, 24 +; GFX9-NEXT: s_lshr_b32 s39, s17, 16 +; GFX9-NEXT: s_lshr_b32 s48, s17, 8 +; GFX9-NEXT: s_lshr_b32 s49, s16, 16 +; GFX9-NEXT: s_lshr_b32 s50, s16, 8 +; GFX9-NEXT: s_lshr_b32 s51, s19, 24 +; GFX9-NEXT: s_lshr_b32 s52, s19, 16 +; GFX9-NEXT: s_lshr_b32 s53, s19, 8 +; GFX9-NEXT: s_lshr_b32 s54, s18, 16 +; GFX9-NEXT: s_lshr_b32 s55, s18, 8 ; GFX9-NEXT: .LBB25_3: ; %end -; GFX9-NEXT: s_and_b32 s7, s16, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s55, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s54, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s44, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s17, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s53, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s52, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s51, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s18, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s21, s55, 8 +; GFX9-NEXT: s_or_b32 s18, s18, s21 +; GFX9-NEXT: s_and_b32 s21, s54, 0xff +; GFX9-NEXT: s_lshl_b32 s23, s44, 8 +; GFX9-NEXT: s_or_b32 s21, s21, s23 +; GFX9-NEXT: s_and_b32 s18, s18, 0xffff +; GFX9-NEXT: s_lshl_b32 s21, s21, 16 +; GFX9-NEXT: s_or_b32 s18, s18, s21 +; GFX9-NEXT: v_mov_b32_e32 v1, s18 +; GFX9-NEXT: s_and_b32 s18, s19, 0xff +; GFX9-NEXT: s_lshl_b32 s19, s53, 8 +; GFX9-NEXT: s_or_b32 s18, s18, s19 +; GFX9-NEXT: s_and_b32 s19, s52, 0xff +; GFX9-NEXT: s_lshl_b32 s21, s51, 8 +; GFX9-NEXT: s_or_b32 s19, s19, s21 +; GFX9-NEXT: s_and_b32 s18, s18, 0xffff +; GFX9-NEXT: s_lshl_b32 s19, s19, 16 +; GFX9-NEXT: s_or_b32 s18, s18, s19 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s18, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s50, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s49, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s42, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s50, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s18 +; GFX9-NEXT: s_and_b32 s18, s49, 0xff +; GFX9-NEXT: s_lshl_b32 s19, s42, 8 +; GFX9-NEXT: s_or_b32 s18, s18, s19 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s18, s18, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s18 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s19, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s48, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s39, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s38, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: s_and_b32 s16, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s48, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: s_and_b32 s17, s39, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s38, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s20, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s37, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s36, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s40, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: s_and_b32 s14, s14, 0xff +; GFX9-NEXT: s_lshl_b32 s16, s37, 8 +; GFX9-NEXT: s_or_b32 s14, s14, s16 +; GFX9-NEXT: s_and_b32 s16, s36, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s40, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: s_and_b32 s14, s14, 0xffff +; GFX9-NEXT: s_lshl_b32 s16, s16, 16 +; GFX9-NEXT: s_or_b32 s14, s14, s16 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s21, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s35, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s34, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s31, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s14 +; GFX9-NEXT: s_and_b32 s14, s15, 0xff +; GFX9-NEXT: s_lshl_b32 s15, s35, 8 +; GFX9-NEXT: s_or_b32 s14, s14, s15 +; GFX9-NEXT: s_and_b32 s15, s34, 0xff +; GFX9-NEXT: s_lshl_b32 s16, s31, 8 +; GFX9-NEXT: s_or_b32 s15, s15, s16 +; GFX9-NEXT: s_and_b32 s14, s14, 0xffff +; GFX9-NEXT: s_lshl_b32 s15, s15, 16 +; GFX9-NEXT: s_or_b32 s14, s14, s15 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s22, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s30, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s95, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s14, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s14 +; GFX9-NEXT: s_and_b32 s12, s12, 0xff +; GFX9-NEXT: s_lshl_b32 s14, s30, 8 +; GFX9-NEXT: s_or_b32 s12, s12, s14 +; GFX9-NEXT: s_and_b32 s14, s95, 0xff +; GFX9-NEXT: s_lshl_b32 s15, s28, 8 +; GFX9-NEXT: s_or_b32 s14, s14, s15 +; GFX9-NEXT: s_and_b32 s12, s12, 0xffff +; GFX9-NEXT: s_lshl_b32 s14, s14, 16 +; GFX9-NEXT: s_or_b32 s12, s12, s14 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s23, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s94, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s93, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s92, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: s_and_b32 s12, s13, 0xff +; GFX9-NEXT: s_lshl_b32 s13, s94, 8 +; GFX9-NEXT: s_or_b32 s12, s12, s13 +; GFX9-NEXT: s_and_b32 s13, s93, 0xff +; GFX9-NEXT: s_lshl_b32 s14, s92, 8 +; GFX9-NEXT: s_or_b32 s13, s13, s14 +; GFX9-NEXT: s_and_b32 s12, s12, 0xffff +; GFX9-NEXT: s_lshl_b32 s13, s13, 16 +; GFX9-NEXT: s_or_b32 s12, s12, s13 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s24, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s91, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s90, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s12, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: s_and_b32 s10, s10, 0xff +; GFX9-NEXT: s_lshl_b32 s12, s91, 8 +; GFX9-NEXT: s_or_b32 s10, s10, s12 +; GFX9-NEXT: s_and_b32 s12, s90, 0xff +; GFX9-NEXT: s_lshl_b32 s13, s26, 8 +; GFX9-NEXT: s_or_b32 s12, s12, s13 +; GFX9-NEXT: s_and_b32 s10, s10, 0xffff +; GFX9-NEXT: s_lshl_b32 s12, s12, 16 +; GFX9-NEXT: s_or_b32 s10, s10, s12 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s25, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s89, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s88, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s79, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NEXT: s_and_b32 s10, s11, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s89, 8 +; GFX9-NEXT: s_or_b32 s10, s10, s11 +; GFX9-NEXT: s_and_b32 s11, s88, 0xff +; GFX9-NEXT: s_lshl_b32 s12, s79, 8 +; GFX9-NEXT: s_or_b32 s11, s11, s12 +; GFX9-NEXT: s_and_b32 s10, s10, 0xffff +; GFX9-NEXT: s_lshl_b32 s11, s11, 16 +; GFX9-NEXT: s_or_b32 s10, s10, s11 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s26, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s78, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s77, 0xff -; GFX9-NEXT: s_lshl_b32 s10, s10, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s10 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NEXT: s_and_b32 s8, s8, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s78, 8 +; GFX9-NEXT: s_or_b32 s8, s8, s10 +; GFX9-NEXT: s_and_b32 s10, s77, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s24, 8 +; GFX9-NEXT: s_or_b32 s10, s10, s11 +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: s_lshl_b32 s10, s10, 16 +; GFX9-NEXT: s_or_b32 s8, s8, s10 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s27, 0xff +; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: s_and_b32 s8, s9, 0xff ; GFX9-NEXT: s_lshl_b32 s9, s76, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_or_b32 s8, s8, s9 ; GFX9-NEXT: s_and_b32 s9, s75, 0xff ; GFX9-NEXT: s_lshl_b32 s10, s74, 8 ; GFX9-NEXT: s_or_b32 s9, s9, s10 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff ; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_or_b32 s8, s8, s9 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s28, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s73, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s72, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s8, 8 -; GFX9-NEXT: s_or_b32 s8, s9, s8 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s8, s8, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s29, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s63, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: s_and_b32 s8, s62, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s61, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s73, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s8 +; GFX9-NEXT: s_and_b32 s8, s72, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s22, 8 ; GFX9-NEXT: s_or_b32 s8, s8, s9 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s8 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: s_and_b32 s6, s7, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s63, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s62, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s61, 8 ; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: s_and_b32 s4, s4, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s60, 8 -; GFX9-NEXT: s_or_b32 s4, s4, s7 -; GFX9-NEXT: s_and_b32 s7, s59, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s6, 8 -; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_lshl_b32 s6, s60, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s59, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s20, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 ; GFX9-NEXT: s_or_b32 s4, s4, s6 @@ -10928,24 +11064,24 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: v_readlane_b32 s55, v4, 15 -; GFX9-NEXT: v_readlane_b32 s54, v4, 14 -; GFX9-NEXT: v_readlane_b32 s53, v4, 13 -; GFX9-NEXT: v_readlane_b32 s52, v4, 12 -; GFX9-NEXT: v_readlane_b32 s51, v4, 11 -; GFX9-NEXT: v_readlane_b32 s50, v4, 10 -; GFX9-NEXT: v_readlane_b32 s49, v4, 9 -; GFX9-NEXT: v_readlane_b32 s48, v4, 8 -; GFX9-NEXT: v_readlane_b32 s39, v4, 7 -; GFX9-NEXT: v_readlane_b32 s38, v4, 6 -; GFX9-NEXT: v_readlane_b32 s37, v4, 5 -; GFX9-NEXT: v_readlane_b32 s36, v4, 4 -; GFX9-NEXT: v_readlane_b32 s35, v4, 3 -; GFX9-NEXT: v_readlane_b32 s34, v4, 2 -; GFX9-NEXT: v_readlane_b32 s31, v4, 1 -; GFX9-NEXT: v_readlane_b32 s30, v4, 0 +; GFX9-NEXT: v_readlane_b32 s55, v18, 15 +; GFX9-NEXT: v_readlane_b32 s54, v18, 14 +; GFX9-NEXT: v_readlane_b32 s53, v18, 13 +; GFX9-NEXT: v_readlane_b32 s52, v18, 12 +; GFX9-NEXT: v_readlane_b32 s51, v18, 11 +; GFX9-NEXT: v_readlane_b32 s50, v18, 10 +; GFX9-NEXT: v_readlane_b32 s49, v18, 9 +; GFX9-NEXT: v_readlane_b32 s48, v18, 8 +; GFX9-NEXT: v_readlane_b32 s39, v18, 7 +; GFX9-NEXT: v_readlane_b32 s38, v18, 6 +; GFX9-NEXT: v_readlane_b32 s37, v18, 5 +; GFX9-NEXT: v_readlane_b32 s36, v18, 4 +; GFX9-NEXT: v_readlane_b32 s35, v18, 3 +; GFX9-NEXT: v_readlane_b32 s34, v18, 2 +; GFX9-NEXT: v_readlane_b32 s31, v18, 1 +; GFX9-NEXT: v_readlane_b32 s30, v18, 0 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -10970,31 +11106,31 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 ; GFX9-NEXT: ; implicit-def: $sgpr31 ; GFX9-NEXT: ; implicit-def: $sgpr30 ; GFX9-NEXT: ; implicit-def: $sgpr95 -; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr28 ; GFX9-NEXT: ; implicit-def: $sgpr94 ; GFX9-NEXT: ; implicit-def: $sgpr93 ; GFX9-NEXT: ; implicit-def: $sgpr92 ; GFX9-NEXT: ; implicit-def: $sgpr91 ; GFX9-NEXT: ; implicit-def: $sgpr90 -; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; implicit-def: $sgpr89 ; GFX9-NEXT: ; implicit-def: $sgpr88 ; GFX9-NEXT: ; implicit-def: $sgpr79 ; GFX9-NEXT: ; implicit-def: $sgpr78 ; GFX9-NEXT: ; implicit-def: $sgpr77 -; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr24 ; GFX9-NEXT: ; implicit-def: $sgpr76 ; GFX9-NEXT: ; implicit-def: $sgpr75 ; GFX9-NEXT: ; implicit-def: $sgpr74 ; GFX9-NEXT: ; implicit-def: $sgpr73 ; GFX9-NEXT: ; implicit-def: $sgpr72 -; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr22 ; GFX9-NEXT: ; implicit-def: $sgpr63 ; GFX9-NEXT: ; implicit-def: $sgpr62 ; GFX9-NEXT: ; implicit-def: $sgpr61 ; GFX9-NEXT: ; implicit-def: $sgpr60 ; GFX9-NEXT: ; implicit-def: $sgpr59 -; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr20 ; GFX9-NEXT: ; implicit-def: $sgpr58 ; GFX9-NEXT: ; implicit-def: $sgpr57 ; GFX9-NEXT: ; implicit-def: $sgpr56 @@ -17840,65 +17976,43 @@ define inreg <16 x float> @bitcast_v32i16_to_v16f32_scalar(<32 x i16> inreg %a, ; VI-LABEL: bitcast_v32i16_to_v16f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, s16 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v5, s18 +; VI-NEXT: v_mov_b32_e32 v6, s19 +; VI-NEXT: v_mov_b32_e32 v7, s20 +; VI-NEXT: v_mov_b32_e32 v8, s21 +; VI-NEXT: v_mov_b32_e32 v9, s22 +; VI-NEXT: v_mov_b32_e32 v10, s23 +; VI-NEXT: v_mov_b32_e32 v11, s24 +; VI-NEXT: v_mov_b32_e32 v12, s25 +; VI-NEXT: v_mov_b32_e32 v13, s26 +; VI-NEXT: v_mov_b32_e32 v14, s27 +; VI-NEXT: v_mov_b32_e32 v15, s28 +; VI-NEXT: v_mov_b32_e32 v16, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_readfirstlane_b32 s6, v0 +; VI-NEXT: v_readfirstlane_b32 s6, v3 +; VI-NEXT: v_readfirstlane_b32 s7, v4 +; VI-NEXT: v_readfirstlane_b32 s8, v5 +; VI-NEXT: v_readfirstlane_b32 s9, v6 +; VI-NEXT: v_readfirstlane_b32 s10, v7 +; VI-NEXT: v_readfirstlane_b32 s11, v8 +; VI-NEXT: v_readfirstlane_b32 s12, v9 +; VI-NEXT: v_readfirstlane_b32 s13, v10 +; VI-NEXT: v_readfirstlane_b32 s14, v11 +; VI-NEXT: v_readfirstlane_b32 s15, v12 +; VI-NEXT: v_readfirstlane_b32 s16, v13 +; VI-NEXT: v_readfirstlane_b32 s17, v14 +; VI-NEXT: v_readfirstlane_b32 s18, v15 +; VI-NEXT: v_readfirstlane_b32 s19, v16 +; VI-NEXT: v_readfirstlane_b32 s20, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_readfirstlane_b32 s7, v1 +; VI-NEXT: v_readfirstlane_b32 s21, v1 ; VI-NEXT: s_cbranch_scc0 .LBB39_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB39_3 ; VI-NEXT: .LBB39_2: ; %cmp.true -; VI-NEXT: s_add_i32 s5, s7, 3 -; VI-NEXT: s_and_b32 s4, s7, 0xffff0000 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s6, 3 -; VI-NEXT: s_add_i32 s7, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s6, 0xffff0000 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s29, 3 -; VI-NEXT: s_add_i32 s6, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s28, 3 -; VI-NEXT: s_add_i32 s29, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s27, 3 -; VI-NEXT: s_add_i32 s28, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s26, 3 -; VI-NEXT: s_add_i32 s27, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s25, 3 -; VI-NEXT: s_add_i32 s26, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s24, 3 -; VI-NEXT: s_add_i32 s25, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s23, 3 -; VI-NEXT: s_add_i32 s24, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s22, 3 -; VI-NEXT: s_add_i32 s23, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s5, s21, 3 -; VI-NEXT: s_add_i32 s22, s4, 0x30000 ; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -17927,24 +18041,74 @@ define inreg <16 x float> @bitcast_v32i16_to_v16f32_scalar(<32 x i16> inreg %a, ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s15, 3 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s15, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s14, 3 +; VI-NEXT: s_add_i32 s15, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s14, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s13, 3 +; VI-NEXT: s_add_i32 s14, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s13, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s12, 3 +; VI-NEXT: s_add_i32 s13, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s12, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s11, 3 +; VI-NEXT: s_add_i32 s12, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s11, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s10, 3 +; VI-NEXT: s_add_i32 s11, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s10, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s9, 3 +; VI-NEXT: s_add_i32 s10, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s9, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s8, 3 +; VI-NEXT: s_add_i32 s9, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s8, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s7, 3 +; VI-NEXT: s_add_i32 s8, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s7, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s6, 3 +; VI-NEXT: s_add_i32 s7, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s6, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s6, s4, 0x30000 ; VI-NEXT: .LBB39_3: ; %end -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 -; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 -; VI-NEXT: v_mov_b32_e32 v13, s29 -; VI-NEXT: v_mov_b32_e32 v14, s6 -; VI-NEXT: v_mov_b32_e32 v15, s7 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v5, s11 +; VI-NEXT: v_mov_b32_e32 v6, s12 +; VI-NEXT: v_mov_b32_e32 v7, s13 +; VI-NEXT: v_mov_b32_e32 v8, s14 +; VI-NEXT: v_mov_b32_e32 v9, s15 +; VI-NEXT: v_mov_b32_e32 v10, s16 +; VI-NEXT: v_mov_b32_e32 v11, s17 +; VI-NEXT: v_mov_b32_e32 v12, s18 +; VI-NEXT: v_mov_b32_e32 v13, s19 +; VI-NEXT: v_mov_b32_e32 v14, s20 +; VI-NEXT: v_mov_b32_e32 v15, s21 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB39_4: ; VI-NEXT: s_branch .LBB39_2 @@ -18354,79 +18518,91 @@ define inreg <32 x half> @bitcast_v16f32_to_v32f16_scalar(<16 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_readfirstlane_b32 s6, v0 +; SI-NEXT: v_mov_b32_e32 v55, s16 +; SI-NEXT: v_mov_b32_e32 v54, s17 +; SI-NEXT: v_mov_b32_e32 v53, s18 +; SI-NEXT: v_mov_b32_e32 v52, s19 +; SI-NEXT: v_mov_b32_e32 v51, s20 +; SI-NEXT: v_mov_b32_e32 v50, s21 +; SI-NEXT: v_mov_b32_e32 v49, s22 +; SI-NEXT: v_mov_b32_e32 v48, s23 +; SI-NEXT: v_mov_b32_e32 v39, s24 +; SI-NEXT: v_mov_b32_e32 v38, s25 +; SI-NEXT: v_mov_b32_e32 v36, s26 +; SI-NEXT: v_mov_b32_e32 v35, s27 +; SI-NEXT: v_mov_b32_e32 v34, s28 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s7, v1 +; SI-NEXT: v_mov_b32_e32 v37, s29 ; SI-NEXT: s_cbranch_scc0 .LBB41_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v55 ; SI-NEXT: s_cbranch_execnz .LBB41_3 ; SI-NEXT: .LBB41_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v16, s24, 1.0 -; SI-NEXT: v_add_f32_e64 v18, s25, 1.0 -; SI-NEXT: v_add_f32_e64 v20, s26, 1.0 -; SI-NEXT: v_add_f32_e64 v22, s27, 1.0 -; SI-NEXT: v_add_f32_e64 v24, s28, 1.0 -; SI-NEXT: v_add_f32_e64 v26, s29, 1.0 -; SI-NEXT: v_add_f32_e64 v28, s6, 1.0 -; SI-NEXT: v_add_f32_e64 v30, s7, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v55 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v54 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v53 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v52 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v51 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v50 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v49 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v48 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v39 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v38 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v36 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v35 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v34 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v37 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 @@ -18439,10 +18615,10 @@ define inreg <32 x half> @bitcast_v16f32_to_v32f16_scalar(<16 x float> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 @@ -18456,7 +18632,7 @@ define inreg <32 x half> @bitcast_v16f32_to_v32f16_scalar(<16 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 @@ -18471,13 +18647,15 @@ define inreg <32 x half> @bitcast_v16f32_to_v32f16_scalar(<16 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: .LBB41_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v32 +; SI-NEXT: v_mov_b32_e32 v1, v33 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB41_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 @@ -21640,695 +21818,665 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; VI-LABEL: bitcast_v32bf16_to_v16f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v20, s30, 0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_writelane_b32 v20, s31, 1 -; VI-NEXT: v_readfirstlane_b32 s30, v0 +; VI-NEXT: v_mov_b32_e32 v10, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_readfirstlane_b32 s31, v1 -; VI-NEXT: s_cbranch_scc0 .LBB47_3 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_cbranch_scc0 .LBB47_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB47_4 +; VI-NEXT: s_cbranch_execnz .LBB47_3 ; VI-NEXT: .LBB47_2: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v16, 0x40c00000 -; VI-NEXT: s_lshl_b32 s4, s30, 16 -; VI-NEXT: v_add_f32_e32 v0, s4, v16 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; VI-NEXT: v_add_f32_e32 v1, s4, v16 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v16 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: s_and_b32 s6, s28, 0xffff0000 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; VI-NEXT: v_add_f32_e32 v2, s6, v16 -; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc -; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[0:1] -; VI-NEXT: v_cndmask_b32_e64 v0, v3, v4, s[4:5] -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; VI-NEXT: s_and_b32 s5, s26, 0xffff0000 -; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[0:1] -; VI-NEXT: v_add_f32_e32 v0, s5, v16 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: s_lshl_b32 s4, s26, 16 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_add_f32_e32 v0, s4, v16 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_and_b32 s5, s24, 0xffff0000 -; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[0:1] -; VI-NEXT: v_add_f32_e32 v0, s5, v16 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_add_f32_e32 v0, s4, v16 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_and_b32 s5, s22, 0xffff0000 -; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[0:1] -; VI-NEXT: v_add_f32_e32 v0, s5, v16 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_add_f32_e32 v0, s4, v16 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_and_b32 s5, s20, 0xffff0000 -; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[0:1] -; VI-NEXT: v_add_f32_e32 v0, s5, v16 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_add_f32_e32 v0, s4, v16 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_and_b32 s5, s18, 0xffff0000 -; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] -; VI-NEXT: v_add_f32_e32 v0, s5, v16 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_add_f32_e32 v0, s4, v16 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] -; VI-NEXT: v_add_f32_e32 v0, s4, v16 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; VI-NEXT: v_add_f32_e32 v0, s6, v16 -; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; VI-NEXT: s_lshl_b32 s6, s17, 16 -; VI-NEXT: v_cndmask_b32_e64 v0, v1, v3, s[4:5] -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; VI-NEXT: v_add_f32_e32 v3, s6, v16 -; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] -; VI-NEXT: v_bfe_u32 v1, v3, 16, 1 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; VI-NEXT: v_add_f32_e32 v3, s6, v16 -; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc -; VI-NEXT: s_lshl_b32 s6, s19, 16 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 -; VI-NEXT: v_add_f32_e32 v3, s6, v16 -; VI-NEXT: v_cndmask_b32_e64 v17, v1, v5, s[4:5] -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: s_and_b32 s6, s19, 0xffff0000 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; VI-NEXT: v_add_f32_e32 v3, s6, v16 -; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] -; VI-NEXT: v_bfe_u32 v9, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v3 -; VI-NEXT: s_lshl_b32 s6, s21, 16 -; VI-NEXT: v_mov_b32_e32 v1, v17 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_cndmask_b32_e64 v17, v5, v7, s[4:5] -; VI-NEXT: v_add_f32_e32 v5, s6, v16 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 -; VI-NEXT: s_and_b32 s6, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v9, v11, vcc -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; VI-NEXT: v_add_f32_e32 v5, s6, v16 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 -; VI-NEXT: v_bfe_u32 v11, v5, 16, 1 -; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] -; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v5 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 -; VI-NEXT: s_lshl_b32 s6, s23, 16 -; VI-NEXT: v_mov_b32_e32 v3, v17 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_cndmask_b32_e64 v17, v7, v9, s[4:5] -; VI-NEXT: v_add_f32_e32 v7, s6, v16 -; VI-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc -; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 -; VI-NEXT: s_and_b32 s6, s23, 0xffff0000 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v7, v7 -; VI-NEXT: v_add_f32_e32 v7, s6, v16 -; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] -; VI-NEXT: v_bfe_u32 v13, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v7 -; VI-NEXT: s_lshl_b32 s6, s25, 16 -; VI-NEXT: v_mov_b32_e32 v5, v17 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_cndmask_b32_e64 v17, v9, v11, s[4:5] -; VI-NEXT: v_add_f32_e32 v9, s6, v16 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 -; VI-NEXT: s_and_b32 s6, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v7, v13, v15, vcc -; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v9, v9 -; VI-NEXT: v_add_f32_e32 v9, s6, v16 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 -; VI-NEXT: v_bfe_u32 v15, v9, 16, 1 -; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] -; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v9 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 -; VI-NEXT: v_mov_b32_e32 v7, v17 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: v_cndmask_b32_e32 v9, v15, v17, vcc -; VI-NEXT: s_lshl_b32 s6, s27, 16 -; VI-NEXT: v_cndmask_b32_e64 v17, v11, v13, s[4:5] -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 -; VI-NEXT: v_add_f32_e32 v11, s6, v16 -; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] -; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 -; VI-NEXT: s_and_b32 s6, s27, 0xffff0000 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v11, v11 -; VI-NEXT: v_add_f32_e32 v11, s6, v16 -; VI-NEXT: v_mov_b32_e32 v9, v17 -; VI-NEXT: v_bfe_u32 v17, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v11 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v11 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: s_and_b32 s7, s31, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc -; VI-NEXT: v_cndmask_b32_e64 v17, v13, v15, s[4:5] -; VI-NEXT: v_add_f32_e32 v13, s7, v16 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 +; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[16:17] +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v17, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_cndmask_b32_e32 v14, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[17:18] +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v13 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v17, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15 +; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 -; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc ; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 -; VI-NEXT: s_lshl_b32 s6, s31, 16 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 -; VI-NEXT: v_mov_b32_e32 v11, v17 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v13 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_add_f32_e32 v13, s6, v16 -; VI-NEXT: v_cndmask_b32_e32 v15, v15, v17, vcc -; VI-NEXT: v_bfe_u32 v17, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v13 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_add_f32_e32 v13, s4, v16 -; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v18, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[18:19] +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v11 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 ; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_cndmask_b32_e32 v13, v15, v19, vcc -; VI-NEXT: v_add_f32_e32 v15, s4, v16 -; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: v_cndmask_b32_e32 v15, v16, v19, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 -; VI-NEXT: v_lshrrev_b64 v[15:16], 16, v[15:16] -; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[17:18] -; VI-NEXT: v_mov_b32_e32 v13, v15 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v18, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v19, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_cndmask_b32_e32 v10, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v10 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[19:20] +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v9 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v19, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_lshrrev_b64 v[19:20], 16, v[19:20] +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v20, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[20:21] +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v7 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v20, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[20:21] +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v21, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[21:22] +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v21, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_lshrrev_b64 v[21:22], 16, v[21:22] +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v22, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[22:23] +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v22, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[22:23] +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v23, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[23:24] +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v23, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[23:24] +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v24, v3, v5, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[24:25] +; VI-NEXT: v_mov_b32_e32 v1, v23 +; VI-NEXT: v_mov_b32_e32 v3, v22 +; VI-NEXT: v_mov_b32_e32 v5, v21 +; VI-NEXT: v_mov_b32_e32 v7, v20 +; VI-NEXT: v_mov_b32_e32 v9, v19 +; VI-NEXT: v_mov_b32_e32 v11, v18 +; VI-NEXT: v_mov_b32_e32 v13, v17 ; VI-NEXT: v_mov_b32_e32 v15, v16 -; VI-NEXT: s_branch .LBB47_5 -; VI-NEXT: .LBB47_3: -; VI-NEXT: s_branch .LBB47_2 -; VI-NEXT: .LBB47_4: -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 -; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 -; VI-NEXT: v_mov_b32_e32 v13, s29 -; VI-NEXT: v_mov_b32_e32 v14, s30 -; VI-NEXT: v_mov_b32_e32 v15, s31 -; VI-NEXT: .LBB47_5: ; %end -; VI-NEXT: v_readlane_b32 s31, v20, 1 -; VI-NEXT: v_readlane_b32 s30, v20, 0 -; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: .LBB47_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB47_4: +; VI-NEXT: s_branch .LBB47_2 ; ; GFX9-LABEL: bitcast_v32bf16_to_v16f32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v20, s30, 0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: v_writelane_b32 v20, s31, 1 -; GFX9-NEXT: v_readfirstlane_b32 s30, v0 +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_readfirstlane_b32 s31, v1 -; GFX9-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB47_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB47_4 +; GFX9-NEXT: s_cbranch_execnz .LBB47_3 ; GFX9-NEXT: .LBB47_2: ; %cmp.true -; GFX9-NEXT: s_and_b32 s4, s31, 0xffff0000 -; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s31, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b32 s4, s30, 0xffff0000 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: s_lshl_b32 s4, s30, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v16 +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v16, v16, v15 +; GFX9-NEXT: v_add_u32_e32 v16, 0x7fff, v16 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v18, vcc ; GFX9-NEXT: v_mov_b32_e32 v16, 0xffff -; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_and_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v15, v17, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v14 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v14, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v14 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v14 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v14, v16, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v14, v17, 16, v14 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v13 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v13, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v13 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v13, v17, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v12 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v12, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v12 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v12 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v12, v16, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v12, v17, 16, v12 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v11 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v11, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v11 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v11 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v11, v16, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v11, v17, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v10 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v10, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v10 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v10, v16, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v10, v17, 16, v10 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v9 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v9, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v9 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v9, v16, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v9, v17, 16, v9 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v8, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v8 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v8, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v8, v17, 16, v8 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v7 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v7, v16, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v7, v17, 16, v7 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v6 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v6 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v6, v16, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v6, v17, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v5 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v5 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v5, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v5, v17, 16, v5 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v4 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v4 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s29, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s28, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s28, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s27, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s26, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s26, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s25, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s24, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s24, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s23, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s22, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s21, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s20, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s19, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v17, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s18, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v17, v17, v2 -; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v4, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v4, v17, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v3 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v2 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v17, v17, v1 -; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s17, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc -; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_lshl_or_b32 v2, v17, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v17, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v17 -; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_bfe_u32 v18, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v1 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v17, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v0 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 -; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc ; GFX9-NEXT: v_bfe_u32 v18, v0, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v18, v18, v0 ; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: v_and_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v16 -; GFX9-NEXT: s_branch .LBB47_5 -; GFX9-NEXT: .LBB47_3: -; GFX9-NEXT: s_branch .LBB47_2 -; GFX9-NEXT: .LBB47_4: -; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 -; GFX9-NEXT: v_mov_b32_e32 v4, s20 -; GFX9-NEXT: v_mov_b32_e32 v5, s21 -; GFX9-NEXT: v_mov_b32_e32 v6, s22 -; GFX9-NEXT: v_mov_b32_e32 v7, s23 -; GFX9-NEXT: v_mov_b32_e32 v8, s24 -; GFX9-NEXT: v_mov_b32_e32 v9, s25 -; GFX9-NEXT: v_mov_b32_e32 v10, s26 -; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: v_mov_b32_e32 v12, s28 -; GFX9-NEXT: v_mov_b32_e32 v13, s29 -; GFX9-NEXT: v_mov_b32_e32 v14, s30 -; GFX9-NEXT: v_mov_b32_e32 v15, s31 -; GFX9-NEXT: .LBB47_5: ; %end -; GFX9-NEXT: v_readlane_b32 s31, v20, 1 -; GFX9-NEXT: v_readlane_b32 s30, v20, 0 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v17, 16, v0 +; GFX9-NEXT: .LBB47_3: ; %end ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB47_4: +; GFX9-NEXT: s_branch .LBB47_2 ; ; GFX11-TRUE16-LABEL: bitcast_v32bf16_to_v16f32_scalar: ; GFX11-TRUE16: ; %bb.0: @@ -24698,92 +24846,120 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v40, s81, 25 ; SI-NEXT: v_writelane_b32 v40, s82, 26 ; SI-NEXT: v_writelane_b32 v40, s83, 27 +; SI-NEXT: v_mov_b32_e32 v4, s16 +; SI-NEXT: v_mov_b32_e32 v5, s17 +; SI-NEXT: v_mov_b32_e32 v6, s18 +; SI-NEXT: v_mov_b32_e32 v7, s19 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v9, s21 +; SI-NEXT: v_mov_b32_e32 v10, s22 +; SI-NEXT: v_mov_b32_e32 v11, s23 +; SI-NEXT: v_mov_b32_e32 v12, s24 +; SI-NEXT: v_mov_b32_e32 v13, s25 +; SI-NEXT: v_mov_b32_e32 v14, s26 +; SI-NEXT: v_mov_b32_e32 v15, s27 +; SI-NEXT: v_mov_b32_e32 v16, s28 +; SI-NEXT: v_mov_b32_e32 v17, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; SI-NEXT: v_writelane_b32 v40, s84, 28 -; SI-NEXT: v_readfirstlane_b32 s36, v1 +; SI-NEXT: v_readfirstlane_b32 s36, v4 +; SI-NEXT: v_readfirstlane_b32 s37, v5 +; SI-NEXT: v_readfirstlane_b32 s34, v6 +; SI-NEXT: v_readfirstlane_b32 s35, v7 +; SI-NEXT: v_readfirstlane_b32 s30, v8 +; SI-NEXT: v_readfirstlane_b32 s31, v9 +; SI-NEXT: v_readfirstlane_b32 s94, v10 +; SI-NEXT: v_readfirstlane_b32 s95, v11 +; SI-NEXT: v_readfirstlane_b32 s92, v12 +; SI-NEXT: v_readfirstlane_b32 s93, v13 +; SI-NEXT: v_readfirstlane_b32 s90, v14 +; SI-NEXT: v_readfirstlane_b32 s91, v15 +; SI-NEXT: v_readfirstlane_b32 s88, v16 +; SI-NEXT: v_readfirstlane_b32 s89, v17 +; SI-NEXT: v_readfirstlane_b32 s78, v1 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s37, v2 +; SI-NEXT: v_readfirstlane_b32 s79, v2 ; SI-NEXT: v_writelane_b32 v40, s85, 29 ; SI-NEXT: s_cbranch_scc0 .LBB49_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s82, s37, 24 -; SI-NEXT: s_lshr_b32 s84, s37, 16 -; SI-NEXT: s_lshr_b32 s85, s37, 8 -; SI-NEXT: s_lshr_b32 s71, s29, 24 -; SI-NEXT: s_lshr_b32 s81, s29, 16 -; SI-NEXT: s_lshr_b32 s83, s29, 8 -; SI-NEXT: s_lshr_b32 s68, s27, 24 -; SI-NEXT: s_lshr_b32 s70, s27, 16 -; SI-NEXT: s_lshr_b32 s80, s27, 8 -; SI-NEXT: s_lshr_b32 s65, s25, 24 -; SI-NEXT: s_lshr_b32 s67, s25, 16 -; SI-NEXT: s_lshr_b32 s69, s25, 8 -; SI-NEXT: s_lshr_b32 s54, s23, 24 -; SI-NEXT: s_lshr_b32 s64, s23, 16 -; SI-NEXT: s_lshr_b32 s66, s23, 8 -; SI-NEXT: s_lshr_b32 s51, s21, 24 -; SI-NEXT: s_lshr_b32 s53, s21, 16 -; SI-NEXT: s_lshr_b32 s55, s21, 8 -; SI-NEXT: s_lshr_b32 s48, s19, 24 -; SI-NEXT: s_lshr_b32 s50, s19, 16 -; SI-NEXT: s_lshr_b32 s52, s19, 8 -; SI-NEXT: s_lshr_b32 s38, s17, 24 -; SI-NEXT: s_lshr_b32 s39, s17, 16 -; SI-NEXT: s_lshr_b32 s49, s17, 8 -; SI-NEXT: s_lshr_b64 s[4:5], s[36:37], 24 -; SI-NEXT: s_lshr_b64 s[6:7], s[36:37], 16 -; SI-NEXT: s_lshr_b64 s[8:9], s[36:37], 8 -; SI-NEXT: s_lshr_b64 s[10:11], s[28:29], 24 -; SI-NEXT: s_lshr_b64 s[12:13], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[14:15], s[28:29], 8 -; SI-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 -; SI-NEXT: s_lshr_b64 s[42:43], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[26:27], 8 -; SI-NEXT: s_lshr_b64 s[46:47], s[24:25], 24 -; SI-NEXT: s_lshr_b64 s[56:57], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[58:59], s[24:25], 8 -; SI-NEXT: s_lshr_b64 s[72:73], s[22:23], 24 -; SI-NEXT: s_lshr_b64 s[76:77], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[78:79], s[22:23], 8 -; SI-NEXT: s_lshr_b64 s[60:61], s[20:21], 24 -; SI-NEXT: s_lshr_b64 s[62:63], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[74:75], s[20:21], 8 -; SI-NEXT: s_lshr_b64 s[88:89], s[18:19], 24 -; SI-NEXT: s_lshr_b64 s[90:91], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[92:93], s[18:19], 8 -; SI-NEXT: s_lshr_b64 s[94:95], s[16:17], 24 -; SI-NEXT: s_lshr_b64 s[30:31], s[16:17], 16 -; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 8 +; SI-NEXT: s_lshr_b32 s82, s79, 24 +; SI-NEXT: s_lshr_b32 s84, s79, 16 +; SI-NEXT: s_lshr_b32 s85, s79, 8 +; SI-NEXT: s_lshr_b32 s71, s89, 24 +; SI-NEXT: s_lshr_b32 s81, s89, 16 +; SI-NEXT: s_lshr_b32 s83, s89, 8 +; SI-NEXT: s_lshr_b32 s68, s91, 24 +; SI-NEXT: s_lshr_b32 s70, s91, 16 +; SI-NEXT: s_lshr_b32 s80, s91, 8 +; SI-NEXT: s_lshr_b32 s65, s93, 24 +; SI-NEXT: s_lshr_b32 s67, s93, 16 +; SI-NEXT: s_lshr_b32 s69, s93, 8 +; SI-NEXT: s_lshr_b32 s54, s95, 24 +; SI-NEXT: s_lshr_b32 s64, s95, 16 +; SI-NEXT: s_lshr_b32 s66, s95, 8 +; SI-NEXT: s_lshr_b32 s51, s31, 24 +; SI-NEXT: s_lshr_b32 s53, s31, 16 +; SI-NEXT: s_lshr_b32 s55, s31, 8 +; SI-NEXT: s_lshr_b32 s48, s35, 24 +; SI-NEXT: s_lshr_b32 s50, s35, 16 +; SI-NEXT: s_lshr_b32 s52, s35, 8 +; SI-NEXT: s_lshr_b32 s38, s37, 24 +; SI-NEXT: s_lshr_b32 s39, s37, 16 +; SI-NEXT: s_lshr_b32 s49, s37, 8 +; SI-NEXT: s_lshr_b64 s[4:5], s[78:79], 24 +; SI-NEXT: s_lshr_b64 s[6:7], s[78:79], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[78:79], 8 +; SI-NEXT: s_lshr_b64 s[10:11], s[88:89], 24 +; SI-NEXT: s_lshr_b64 s[12:13], s[88:89], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[88:89], 8 +; SI-NEXT: s_lshr_b64 s[16:17], s[90:91], 24 +; SI-NEXT: s_lshr_b64 s[18:19], s[90:91], 16 +; SI-NEXT: s_lshr_b64 s[20:21], s[90:91], 8 +; SI-NEXT: s_lshr_b64 s[22:23], s[92:93], 24 +; SI-NEXT: s_lshr_b64 s[24:25], s[92:93], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[92:93], 8 +; SI-NEXT: s_lshr_b64 s[42:43], s[94:95], 24 +; SI-NEXT: s_lshr_b64 s[46:47], s[94:95], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[94:95], 8 +; SI-NEXT: s_lshr_b64 s[28:29], s[30:31], 24 +; SI-NEXT: s_lshr_b64 s[40:41], s[30:31], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[30:31], 8 +; SI-NEXT: s_lshr_b64 s[58:59], s[34:35], 24 +; SI-NEXT: s_lshr_b64 s[60:61], s[34:35], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[34:35], 8 +; SI-NEXT: s_lshr_b64 s[72:73], s[36:37], 24 +; SI-NEXT: s_lshr_b64 s[74:75], s[36:37], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[36:37], 8 ; SI-NEXT: s_cbranch_execnz .LBB49_4 ; SI-NEXT: .LBB49_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v20, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v22, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v16, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v18, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v11, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v15, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v9, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v10, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v7, s25, 1.0 -; SI-NEXT: v_add_f32_e64 v8, s24, 1.0 -; SI-NEXT: v_add_f32_e64 v5, s27, 1.0 -; SI-NEXT: v_add_f32_e64 v6, s26, 1.0 -; SI-NEXT: v_add_f32_e64 v3, s29, 1.0 -; SI-NEXT: v_add_f32_e64 v4, s28, 1.0 -; SI-NEXT: v_add_f32_e64 v1, s37, 1.0 -; SI-NEXT: v_add_f32_e64 v2, s36, 1.0 -; SI-NEXT: v_readfirstlane_b32 s16, v22 -; SI-NEXT: v_readfirstlane_b32 s17, v20 -; SI-NEXT: v_readfirstlane_b32 s18, v18 -; SI-NEXT: v_readfirstlane_b32 s19, v16 -; SI-NEXT: v_readfirstlane_b32 s20, v15 -; SI-NEXT: v_readfirstlane_b32 s21, v11 -; SI-NEXT: v_readfirstlane_b32 s22, v10 -; SI-NEXT: v_readfirstlane_b32 s23, v9 -; SI-NEXT: v_readfirstlane_b32 s24, v8 -; SI-NEXT: v_readfirstlane_b32 s25, v7 -; SI-NEXT: v_readfirstlane_b32 s26, v6 -; SI-NEXT: v_readfirstlane_b32 s27, v5 +; SI-NEXT: v_add_f32_e64 v20, s37, 1.0 +; SI-NEXT: v_add_f32_e64 v22, s36, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s35, 1.0 +; SI-NEXT: v_add_f32_e64 v18, s34, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s31, 1.0 +; SI-NEXT: v_add_f32_e64 v15, s30, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s95, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s94, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s93, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s92, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s91, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s90, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s89, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s88, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s79, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s78, 1.0 +; SI-NEXT: v_readfirstlane_b32 s76, v22 +; SI-NEXT: v_readfirstlane_b32 s77, v20 +; SI-NEXT: v_readfirstlane_b32 s62, v18 +; SI-NEXT: v_readfirstlane_b32 s63, v16 +; SI-NEXT: v_readfirstlane_b32 s44, v15 +; SI-NEXT: v_readfirstlane_b32 s45, v11 +; SI-NEXT: v_readfirstlane_b32 s28, v10 +; SI-NEXT: v_readfirstlane_b32 s29, v9 +; SI-NEXT: v_readfirstlane_b32 s26, v8 +; SI-NEXT: v_readfirstlane_b32 s27, v7 +; SI-NEXT: v_readfirstlane_b32 s20, v6 +; SI-NEXT: v_readfirstlane_b32 s21, v5 ; SI-NEXT: v_readfirstlane_b32 s14, v4 ; SI-NEXT: v_readfirstlane_b32 s15, v3 ; SI-NEXT: v_readfirstlane_b32 s8, v2 @@ -24794,24 +24970,24 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; SI-NEXT: s_lshr_b64 s[10:11], s[14:15], 24 ; SI-NEXT: s_lshr_b64 s[12:13], s[14:15], 16 ; SI-NEXT: s_lshr_b64 s[14:15], s[14:15], 8 -; SI-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 -; SI-NEXT: s_lshr_b64 s[42:43], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[26:27], 8 -; SI-NEXT: s_lshr_b64 s[46:47], s[24:25], 24 -; SI-NEXT: s_lshr_b64 s[56:57], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[58:59], s[24:25], 8 -; SI-NEXT: s_lshr_b64 s[72:73], s[22:23], 24 -; SI-NEXT: s_lshr_b64 s[76:77], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[78:79], s[22:23], 8 -; SI-NEXT: s_lshr_b64 s[60:61], s[20:21], 24 -; SI-NEXT: s_lshr_b64 s[62:63], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[74:75], s[20:21], 8 -; SI-NEXT: s_lshr_b64 s[88:89], s[18:19], 24 -; SI-NEXT: s_lshr_b64 s[90:91], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[92:93], s[18:19], 8 -; SI-NEXT: s_lshr_b64 s[94:95], s[16:17], 24 -; SI-NEXT: s_lshr_b64 s[30:31], s[16:17], 16 -; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 8 +; SI-NEXT: s_lshr_b64 s[16:17], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[18:19], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[20:21], s[20:21], 8 +; SI-NEXT: s_lshr_b64 s[22:23], s[26:27], 24 +; SI-NEXT: s_lshr_b64 s[24:25], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[26:27], 8 +; SI-NEXT: s_lshr_b64 s[42:43], s[28:29], 24 +; SI-NEXT: s_lshr_b64 s[46:47], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[28:29], 8 +; SI-NEXT: s_lshr_b64 s[28:29], s[44:45], 24 +; SI-NEXT: s_lshr_b64 s[40:41], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[44:45], 8 +; SI-NEXT: s_lshr_b64 s[58:59], s[62:63], 24 +; SI-NEXT: s_lshr_b64 s[60:61], s[62:63], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[62:63], 8 +; SI-NEXT: s_lshr_b64 s[72:73], s[76:77], 24 +; SI-NEXT: s_lshr_b64 s[74:75], s[76:77], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[76:77], 8 ; SI-NEXT: v_lshrrev_b32_e32 v12, 24, v1 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v1 @@ -24838,21 +25014,21 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v20 ; SI-NEXT: s_branch .LBB49_5 ; SI-NEXT: .LBB49_3: -; SI-NEXT: ; implicit-def: $sgpr34 -; SI-NEXT: ; implicit-def: $sgpr30 -; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: ; implicit-def: $sgpr49 ; SI-NEXT: ; implicit-def: $sgpr39 ; SI-NEXT: ; implicit-def: $sgpr38 -; SI-NEXT: ; implicit-def: $sgpr92 -; SI-NEXT: ; implicit-def: $sgpr90 -; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: ; implicit-def: $sgpr52 ; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr55 ; SI-NEXT: ; implicit-def: $sgpr53 ; SI-NEXT: ; implicit-def: $sgpr51 @@ -24871,15 +25047,15 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr85 ; SI-NEXT: ; implicit-def: $sgpr84 ; SI-NEXT: ; implicit-def: $sgpr82 -; SI-NEXT: ; implicit-def: $sgpr78 -; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr44 ; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr16 ; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: ; implicit-def: $sgpr10 @@ -24888,22 +25064,22 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: s_branch .LBB49_2 ; SI-NEXT: .LBB49_4: -; SI-NEXT: v_mov_b32_e32 v22, s16 -; SI-NEXT: v_mov_b32_e32 v20, s17 -; SI-NEXT: v_mov_b32_e32 v18, s18 -; SI-NEXT: v_mov_b32_e32 v16, s19 -; SI-NEXT: v_mov_b32_e32 v15, s20 -; SI-NEXT: v_mov_b32_e32 v11, s21 -; SI-NEXT: v_mov_b32_e32 v10, s22 -; SI-NEXT: v_mov_b32_e32 v9, s23 -; SI-NEXT: v_mov_b32_e32 v8, s24 -; SI-NEXT: v_mov_b32_e32 v7, s25 -; SI-NEXT: v_mov_b32_e32 v6, s26 -; SI-NEXT: v_mov_b32_e32 v5, s27 -; SI-NEXT: v_mov_b32_e32 v4, s28 -; SI-NEXT: v_mov_b32_e32 v3, s29 -; SI-NEXT: v_mov_b32_e32 v2, s36 -; SI-NEXT: v_mov_b32_e32 v1, s37 +; SI-NEXT: v_mov_b32_e32 v22, s36 +; SI-NEXT: v_mov_b32_e32 v20, s37 +; SI-NEXT: v_mov_b32_e32 v18, s34 +; SI-NEXT: v_mov_b32_e32 v16, s35 +; SI-NEXT: v_mov_b32_e32 v15, s30 +; SI-NEXT: v_mov_b32_e32 v11, s31 +; SI-NEXT: v_mov_b32_e32 v10, s94 +; SI-NEXT: v_mov_b32_e32 v9, s95 +; SI-NEXT: v_mov_b32_e32 v8, s92 +; SI-NEXT: v_mov_b32_e32 v7, s93 +; SI-NEXT: v_mov_b32_e32 v6, s90 +; SI-NEXT: v_mov_b32_e32 v5, s91 +; SI-NEXT: v_mov_b32_e32 v4, s88 +; SI-NEXT: v_mov_b32_e32 v3, s89 +; SI-NEXT: v_mov_b32_e32 v2, s78 +; SI-NEXT: v_mov_b32_e32 v1, s79 ; SI-NEXT: v_mov_b32_e32 v48, s49 ; SI-NEXT: v_mov_b32_e32 v39, s39 ; SI-NEXT: v_mov_b32_e32 v38, s38 @@ -24930,11 +25106,11 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v12, s82 ; SI-NEXT: .LBB49_5: ; %end ; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 -; SI-NEXT: s_lshl_b32 s5, s34, 8 +; SI-NEXT: s_lshl_b32 s5, s76, 8 ; SI-NEXT: v_or_b32_e32 v22, s5, v22 -; SI-NEXT: s_and_b32 s5, s30, 0xff +; SI-NEXT: s_and_b32 s5, s74, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s7, s94, 24 +; SI-NEXT: s_lshl_b32 s7, s72, 24 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: s_or_b32 s5, s7, s5 ; SI-NEXT: v_or_b32_e32 v22, s5, v22 @@ -24945,15 +25121,15 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v20, v20, v22 ; SI-NEXT: v_and_b32_e32 v22, 0xff, v39 ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 -; SI-NEXT: s_lshl_b32 s5, s92, 8 +; SI-NEXT: s_lshl_b32 s5, s62, 8 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_lshlrev_b32_e32 v38, 24, v38 ; SI-NEXT: v_or_b32_e32 v18, s5, v18 -; SI-NEXT: s_and_b32 s5, s90, 0xff +; SI-NEXT: s_and_b32 s5, s60, 0xff ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; SI-NEXT: v_or_b32_e32 v22, v38, v22 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s7, s88, 24 +; SI-NEXT: s_lshl_b32 s7, s58, 24 ; SI-NEXT: v_or_b32_e32 v20, v20, v22 ; SI-NEXT: v_add_i32_e32 v22, vcc, 4, v0 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 @@ -24969,15 +25145,15 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v16, v16, v18 ; SI-NEXT: v_and_b32_e32 v18, 0xff, v36 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; SI-NEXT: s_lshl_b32 s5, s74, 8 +; SI-NEXT: s_lshl_b32 s5, s44, 8 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v35 ; SI-NEXT: v_or_b32_e32 v15, s5, v15 -; SI-NEXT: s_and_b32 s5, s62, 0xff +; SI-NEXT: s_and_b32 s5, s40, 0xff ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s7, s60, 24 +; SI-NEXT: s_lshl_b32 s7, s28, 24 ; SI-NEXT: v_or_b32_e32 v16, v16, v18 ; SI-NEXT: v_add_i32_e32 v18, vcc, 12, v0 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 @@ -24993,15 +25169,15 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v11, v11, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v33 ; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: s_lshl_b32 s5, s78, 8 +; SI-NEXT: s_lshl_b32 s5, s56, 8 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v32 ; SI-NEXT: v_or_b32_e32 v10, s5, v10 -; SI-NEXT: s_and_b32 s5, s76, 0xff +; SI-NEXT: s_and_b32 s5, s46, 0xff ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s7, s72, 24 +; SI-NEXT: s_lshl_b32 s7, s42, 24 ; SI-NEXT: v_or_b32_e32 v11, v11, v15 ; SI-NEXT: v_add_i32_e32 v15, vcc, 20, v0 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 @@ -25017,15 +25193,15 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: v_and_b32_e32 v10, 0xff, v30 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: s_lshl_b32 s5, s58, 8 +; SI-NEXT: s_lshl_b32 s5, s26, 8 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v29 ; SI-NEXT: v_or_b32_e32 v8, s5, v8 -; SI-NEXT: s_and_b32 s5, s56, 0xff +; SI-NEXT: s_and_b32 s5, s24, 0xff ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s7, s46, 24 +; SI-NEXT: s_lshl_b32 s7, s22, 24 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: v_add_i32_e32 v10, vcc, 28, v0 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 @@ -25041,15 +25217,15 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v27 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: s_lshl_b32 s5, s44, 8 +; SI-NEXT: s_lshl_b32 s5, s20, 8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v26 ; SI-NEXT: v_or_b32_e32 v6, s5, v6 -; SI-NEXT: s_and_b32 s5, s42, 0xff +; SI-NEXT: s_and_b32 s5, s18, 0xff ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s7, s40, 24 +; SI-NEXT: s_lshl_b32 s7, s16, 24 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: v_add_i32_e32 v8, vcc, 36, v0 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 @@ -25180,10 +25356,38 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; VI-NEXT: v_writelane_b32 v63, s64, 16 ; VI-NEXT: v_writelane_b32 v63, s65, 17 ; VI-NEXT: v_writelane_b32 v63, s66, 18 +; VI-NEXT: v_mov_b32_e32 v4, s16 +; VI-NEXT: v_mov_b32_e32 v5, s17 +; VI-NEXT: v_mov_b32_e32 v6, s18 +; VI-NEXT: v_mov_b32_e32 v7, s19 +; VI-NEXT: v_mov_b32_e32 v8, s20 +; VI-NEXT: v_mov_b32_e32 v9, s21 +; VI-NEXT: v_mov_b32_e32 v10, s22 +; VI-NEXT: v_mov_b32_e32 v11, s23 +; VI-NEXT: v_mov_b32_e32 v12, s24 +; VI-NEXT: v_mov_b32_e32 v13, s25 +; VI-NEXT: v_mov_b32_e32 v14, s26 +; VI-NEXT: v_mov_b32_e32 v15, s27 +; VI-NEXT: v_mov_b32_e32 v16, s28 +; VI-NEXT: v_mov_b32_e32 v17, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; VI-NEXT: v_writelane_b32 v63, s67, 19 +; VI-NEXT: v_readfirstlane_b32 s18, v4 +; VI-NEXT: v_readfirstlane_b32 s19, v5 +; VI-NEXT: v_readfirstlane_b32 s16, v6 +; VI-NEXT: v_readfirstlane_b32 s17, v7 +; VI-NEXT: v_readfirstlane_b32 s14, v8 +; VI-NEXT: v_readfirstlane_b32 s15, v9 +; VI-NEXT: v_readfirstlane_b32 s12, v10 +; VI-NEXT: v_readfirstlane_b32 s13, v11 +; VI-NEXT: v_readfirstlane_b32 s10, v12 +; VI-NEXT: v_readfirstlane_b32 s11, v13 +; VI-NEXT: v_readfirstlane_b32 s8, v14 +; VI-NEXT: v_readfirstlane_b32 s9, v15 +; VI-NEXT: v_readfirstlane_b32 s6, v16 +; VI-NEXT: v_readfirstlane_b32 s7, v17 ; VI-NEXT: v_readfirstlane_b32 s4, v1 -; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_and_b64 s[20:21], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s5, v2 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -25207,75 +25411,75 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; VI-NEXT: s_lshr_b32 s59, s5, 8 ; VI-NEXT: s_lshr_b32 s58, s4, 16 ; VI-NEXT: s_lshr_b32 s60, s4, 8 -; VI-NEXT: s_lshr_b32 s61, s29, 24 -; VI-NEXT: s_lshr_b32 s62, s29, 16 -; VI-NEXT: s_lshr_b32 s72, s29, 8 -; VI-NEXT: s_lshr_b32 s63, s28, 16 -; VI-NEXT: s_lshr_b32 s73, s28, 8 -; VI-NEXT: s_lshr_b32 s74, s27, 24 -; VI-NEXT: s_lshr_b32 s75, s27, 16 -; VI-NEXT: s_lshr_b32 s77, s27, 8 -; VI-NEXT: s_lshr_b32 s76, s26, 16 -; VI-NEXT: s_lshr_b32 s78, s26, 8 -; VI-NEXT: s_lshr_b32 s79, s25, 24 -; VI-NEXT: s_lshr_b32 s88, s25, 16 -; VI-NEXT: s_lshr_b32 s90, s25, 8 -; VI-NEXT: s_lshr_b32 s89, s24, 16 -; VI-NEXT: s_lshr_b32 s91, s24, 8 -; VI-NEXT: s_lshr_b32 s30, s23, 24 -; VI-NEXT: s_lshr_b32 s31, s23, 16 -; VI-NEXT: s_lshr_b32 s35, s23, 8 -; VI-NEXT: s_lshr_b32 s34, s22, 16 -; VI-NEXT: s_lshr_b32 s36, s22, 8 -; VI-NEXT: s_lshr_b32 s37, s21, 24 -; VI-NEXT: s_lshr_b32 s38, s21, 16 -; VI-NEXT: s_lshr_b32 s48, s21, 8 -; VI-NEXT: s_lshr_b32 s39, s20, 16 -; VI-NEXT: s_lshr_b32 s49, s20, 8 -; VI-NEXT: s_lshr_b32 s50, s19, 24 -; VI-NEXT: s_lshr_b32 s51, s19, 16 -; VI-NEXT: s_lshr_b32 s53, s19, 8 -; VI-NEXT: s_lshr_b32 s52, s18, 16 -; VI-NEXT: s_lshr_b32 s54, s18, 8 -; VI-NEXT: s_lshr_b32 s55, s17, 24 -; VI-NEXT: s_lshr_b32 s64, s17, 16 -; VI-NEXT: s_lshr_b32 s66, s17, 8 -; VI-NEXT: s_lshr_b32 s65, s16, 16 -; VI-NEXT: s_lshr_b32 s67, s16, 8 +; VI-NEXT: s_lshr_b32 s61, s7, 24 +; VI-NEXT: s_lshr_b32 s62, s7, 16 +; VI-NEXT: s_lshr_b32 s72, s7, 8 +; VI-NEXT: s_lshr_b32 s63, s6, 16 +; VI-NEXT: s_lshr_b32 s73, s6, 8 +; VI-NEXT: s_lshr_b32 s74, s9, 24 +; VI-NEXT: s_lshr_b32 s75, s9, 16 +; VI-NEXT: s_lshr_b32 s77, s9, 8 +; VI-NEXT: s_lshr_b32 s76, s8, 16 +; VI-NEXT: s_lshr_b32 s78, s8, 8 +; VI-NEXT: s_lshr_b32 s79, s11, 24 +; VI-NEXT: s_lshr_b32 s88, s11, 16 +; VI-NEXT: s_lshr_b32 s90, s11, 8 +; VI-NEXT: s_lshr_b32 s89, s10, 16 +; VI-NEXT: s_lshr_b32 s91, s10, 8 +; VI-NEXT: s_lshr_b32 s30, s13, 24 +; VI-NEXT: s_lshr_b32 s31, s13, 16 +; VI-NEXT: s_lshr_b32 s35, s13, 8 +; VI-NEXT: s_lshr_b32 s34, s12, 16 +; VI-NEXT: s_lshr_b32 s36, s12, 8 +; VI-NEXT: s_lshr_b32 s37, s15, 24 +; VI-NEXT: s_lshr_b32 s38, s15, 16 +; VI-NEXT: s_lshr_b32 s48, s15, 8 +; VI-NEXT: s_lshr_b32 s39, s14, 16 +; VI-NEXT: s_lshr_b32 s49, s14, 8 +; VI-NEXT: s_lshr_b32 s50, s17, 24 +; VI-NEXT: s_lshr_b32 s51, s17, 16 +; VI-NEXT: s_lshr_b32 s53, s17, 8 +; VI-NEXT: s_lshr_b32 s52, s16, 16 +; VI-NEXT: s_lshr_b32 s54, s16, 8 +; VI-NEXT: s_lshr_b32 s55, s19, 24 +; VI-NEXT: s_lshr_b32 s64, s19, 16 +; VI-NEXT: s_lshr_b32 s66, s19, 8 +; VI-NEXT: s_lshr_b32 s65, s18, 16 +; VI-NEXT: s_lshr_b32 s67, s18, 8 ; VI-NEXT: s_lshr_b64 s[44:45], s[4:5], 24 -; VI-NEXT: s_lshr_b64 s[42:43], s[28:29], 24 -; VI-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 -; VI-NEXT: s_lshr_b64 s[14:15], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[12:13], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[42:43], s[6:7], 24 +; VI-NEXT: s_lshr_b64 s[40:41], s[8:9], 24 +; VI-NEXT: s_lshr_b64 s[28:29], s[10:11], 24 +; VI-NEXT: s_lshr_b64 s[26:27], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[24:25], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[22:23], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[20:21], s[18:19], 24 ; VI-NEXT: s_cbranch_execnz .LBB49_4 ; VI-NEXT: .LBB49_2: ; %cmp.true -; VI-NEXT: v_add_f32_e64 v6, s27, 1.0 -; VI-NEXT: v_add_f32_e64 v5, s26, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s9, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s8, 1.0 ; VI-NEXT: v_add_f32_e64 v2, s5, 1.0 ; VI-NEXT: v_add_f32_e64 v1, s4, 1.0 -; VI-NEXT: v_add_f32_e64 v8, s25, 1.0 -; VI-NEXT: v_add_f32_e64 v7, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v8, s11, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s10, 1.0 ; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[1:2] ; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[5:6] -; VI-NEXT: v_add_f32_e64 v10, s23, 1.0 -; VI-NEXT: v_add_f32_e64 v9, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v10, s13, 1.0 +; VI-NEXT: v_add_f32_e64 v9, s12, 1.0 ; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] -; VI-NEXT: v_add_f32_e64 v12, s21, 1.0 -; VI-NEXT: v_add_f32_e64 v11, s20, 1.0 -; VI-NEXT: v_add_f32_e64 v4, s29, 1.0 -; VI-NEXT: v_add_f32_e64 v3, s28, 1.0 +; VI-NEXT: v_add_f32_e64 v12, s15, 1.0 +; VI-NEXT: v_add_f32_e64 v11, s14, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s7, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s6, 1.0 ; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[9:10] -; VI-NEXT: v_add_f32_e64 v16, s19, 1.0 -; VI-NEXT: v_add_f32_e64 v15, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v16, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v15, s16, 1.0 ; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[3:4] ; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[11:12] -; VI-NEXT: v_add_f32_e64 v18, s17, 1.0 -; VI-NEXT: v_add_f32_e64 v17, s16, 1.0 +; VI-NEXT: v_add_f32_e64 v18, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v17, s18, 1.0 ; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[15:16] ; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill @@ -25324,31 +25528,31 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; VI-NEXT: .LBB49_3: ; VI-NEXT: ; implicit-def: $sgpr67 ; VI-NEXT: ; implicit-def: $sgpr65 -; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr20 ; VI-NEXT: ; implicit-def: $sgpr66 ; VI-NEXT: ; implicit-def: $sgpr64 ; VI-NEXT: ; implicit-def: $sgpr55 ; VI-NEXT: ; implicit-def: $sgpr54 ; VI-NEXT: ; implicit-def: $sgpr52 -; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr22 ; VI-NEXT: ; implicit-def: $sgpr53 ; VI-NEXT: ; implicit-def: $sgpr51 ; VI-NEXT: ; implicit-def: $sgpr50 ; VI-NEXT: ; implicit-def: $sgpr49 ; VI-NEXT: ; implicit-def: $sgpr39 -; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr24 ; VI-NEXT: ; implicit-def: $sgpr48 ; VI-NEXT: ; implicit-def: $sgpr38 ; VI-NEXT: ; implicit-def: $sgpr37 ; VI-NEXT: ; implicit-def: $sgpr36 ; VI-NEXT: ; implicit-def: $sgpr34 -; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; implicit-def: $sgpr35 ; VI-NEXT: ; implicit-def: $sgpr31 ; VI-NEXT: ; implicit-def: $sgpr30 ; VI-NEXT: ; implicit-def: $sgpr91 ; VI-NEXT: ; implicit-def: $sgpr89 -; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr28 ; VI-NEXT: ; implicit-def: $sgpr90 ; VI-NEXT: ; implicit-def: $sgpr88 ; VI-NEXT: ; implicit-def: $sgpr79 @@ -25376,20 +25580,20 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v20, s42 -; VI-NEXT: v_mov_b32_e32 v17, s16 -; VI-NEXT: v_mov_b32_e32 v18, s17 -; VI-NEXT: v_mov_b32_e32 v15, s18 -; VI-NEXT: v_mov_b32_e32 v16, s19 -; VI-NEXT: v_mov_b32_e32 v11, s20 -; VI-NEXT: v_mov_b32_e32 v12, s21 -; VI-NEXT: v_mov_b32_e32 v9, s22 -; VI-NEXT: v_mov_b32_e32 v10, s23 -; VI-NEXT: v_mov_b32_e32 v7, s24 -; VI-NEXT: v_mov_b32_e32 v8, s25 -; VI-NEXT: v_mov_b32_e32 v5, s26 -; VI-NEXT: v_mov_b32_e32 v6, s27 -; VI-NEXT: v_mov_b32_e32 v3, s28 -; VI-NEXT: v_mov_b32_e32 v4, s29 +; VI-NEXT: v_mov_b32_e32 v17, s18 +; VI-NEXT: v_mov_b32_e32 v18, s19 +; VI-NEXT: v_mov_b32_e32 v15, s16 +; VI-NEXT: v_mov_b32_e32 v16, s17 +; VI-NEXT: v_mov_b32_e32 v11, s14 +; VI-NEXT: v_mov_b32_e32 v12, s15 +; VI-NEXT: v_mov_b32_e32 v9, s12 +; VI-NEXT: v_mov_b32_e32 v10, s13 +; VI-NEXT: v_mov_b32_e32 v7, s10 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v5, s8 +; VI-NEXT: v_mov_b32_e32 v6, s9 +; VI-NEXT: v_mov_b32_e32 v3, s6 +; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: v_mov_b32_e32 v19, s67 @@ -25432,14 +25636,14 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v27, s59 ; VI-NEXT: v_mov_b32_e32 v14, s57 ; VI-NEXT: v_mov_b32_e32 v26, s56 -; VI-NEXT: v_mov_b32_e32 v22, s12 -; VI-NEXT: v_mov_b32_e32 v23, s10 -; VI-NEXT: v_mov_b32_e32 v24, s8 -; VI-NEXT: v_mov_b32_e32 v25, s6 +; VI-NEXT: v_mov_b32_e32 v22, s26 +; VI-NEXT: v_mov_b32_e32 v23, s24 +; VI-NEXT: v_mov_b32_e32 v24, s22 +; VI-NEXT: v_mov_b32_e32 v25, s20 ; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v20, s40 -; VI-NEXT: v_mov_b32_e32 v21, s14 +; VI-NEXT: v_mov_b32_e32 v21, s28 ; VI-NEXT: .LBB49_5: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 @@ -25620,10 +25824,38 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: v_writelane_b32 v63, s52, 12 ; GFX9-NEXT: v_writelane_b32 v63, s53, 13 ; GFX9-NEXT: v_writelane_b32 v63, s54, 14 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: v_mov_b32_e32 v5, s17 +; GFX9-NEXT: v_mov_b32_e32 v6, s18 +; GFX9-NEXT: v_mov_b32_e32 v7, s19 +; GFX9-NEXT: v_mov_b32_e32 v8, s20 +; GFX9-NEXT: v_mov_b32_e32 v9, s21 +; GFX9-NEXT: v_mov_b32_e32 v10, s22 +; GFX9-NEXT: v_mov_b32_e32 v11, s23 +; GFX9-NEXT: v_mov_b32_e32 v12, s24 +; GFX9-NEXT: v_mov_b32_e32 v13, s25 +; GFX9-NEXT: v_mov_b32_e32 v14, s26 +; GFX9-NEXT: v_mov_b32_e32 v15, s27 +; GFX9-NEXT: v_mov_b32_e32 v16, s28 +; GFX9-NEXT: v_mov_b32_e32 v17, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; GFX9-NEXT: v_writelane_b32 v63, s55, 15 +; GFX9-NEXT: v_readfirstlane_b32 s18, v4 +; GFX9-NEXT: v_readfirstlane_b32 s19, v5 +; GFX9-NEXT: v_readfirstlane_b32 s16, v6 +; GFX9-NEXT: v_readfirstlane_b32 s17, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s12, v10 +; GFX9-NEXT: v_readfirstlane_b32 s13, v11 +; GFX9-NEXT: v_readfirstlane_b32 s10, v12 +; GFX9-NEXT: v_readfirstlane_b32 s11, v13 +; GFX9-NEXT: v_readfirstlane_b32 s8, v14 +; GFX9-NEXT: v_readfirstlane_b32 s9, v15 +; GFX9-NEXT: v_readfirstlane_b32 s6, v16 +; GFX9-NEXT: v_readfirstlane_b32 s7, v17 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec ; GFX9-NEXT: v_readfirstlane_b32 s5, v2 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -25647,76 +25879,76 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: s_lshr_b32 s59, s5, 8 ; GFX9-NEXT: s_lshr_b32 s58, s4, 16 ; GFX9-NEXT: s_lshr_b32 s60, s4, 8 -; GFX9-NEXT: s_lshr_b32 s61, s29, 24 -; GFX9-NEXT: s_lshr_b32 s62, s29, 16 -; GFX9-NEXT: s_lshr_b32 s72, s29, 8 -; GFX9-NEXT: s_lshr_b32 s63, s28, 16 -; GFX9-NEXT: s_lshr_b32 s73, s28, 8 -; GFX9-NEXT: s_lshr_b32 s74, s27, 24 -; GFX9-NEXT: s_lshr_b32 s75, s27, 16 -; GFX9-NEXT: s_lshr_b32 s77, s27, 8 -; GFX9-NEXT: s_lshr_b32 s76, s26, 16 -; GFX9-NEXT: s_lshr_b32 s78, s26, 8 -; GFX9-NEXT: s_lshr_b32 s79, s25, 24 -; GFX9-NEXT: s_lshr_b32 s88, s25, 16 -; GFX9-NEXT: s_lshr_b32 s90, s25, 8 -; GFX9-NEXT: s_lshr_b32 s89, s24, 16 -; GFX9-NEXT: s_lshr_b32 s91, s24, 8 -; GFX9-NEXT: s_lshr_b32 s92, s23, 24 -; GFX9-NEXT: s_lshr_b32 s93, s23, 16 -; GFX9-NEXT: s_lshr_b32 s95, s23, 8 -; GFX9-NEXT: s_lshr_b32 s94, s22, 16 -; GFX9-NEXT: s_lshr_b32 s30, s22, 8 -; GFX9-NEXT: s_lshr_b32 s31, s21, 24 -; GFX9-NEXT: s_lshr_b32 s34, s21, 16 -; GFX9-NEXT: s_lshr_b32 s36, s21, 8 -; GFX9-NEXT: s_lshr_b32 s35, s20, 16 -; GFX9-NEXT: s_lshr_b32 s37, s20, 8 -; GFX9-NEXT: s_lshr_b32 s38, s19, 24 -; GFX9-NEXT: s_lshr_b32 s39, s19, 16 -; GFX9-NEXT: s_lshr_b32 s49, s19, 8 -; GFX9-NEXT: s_lshr_b32 s48, s18, 16 -; GFX9-NEXT: s_lshr_b32 s50, s18, 8 -; GFX9-NEXT: s_lshr_b32 s51, s17, 24 -; GFX9-NEXT: s_lshr_b32 s52, s17, 16 -; GFX9-NEXT: s_lshr_b32 s54, s17, 8 -; GFX9-NEXT: s_lshr_b32 s53, s16, 16 -; GFX9-NEXT: s_lshr_b32 s55, s16, 8 +; GFX9-NEXT: s_lshr_b32 s61, s7, 24 +; GFX9-NEXT: s_lshr_b32 s62, s7, 16 +; GFX9-NEXT: s_lshr_b32 s72, s7, 8 +; GFX9-NEXT: s_lshr_b32 s63, s6, 16 +; GFX9-NEXT: s_lshr_b32 s73, s6, 8 +; GFX9-NEXT: s_lshr_b32 s74, s9, 24 +; GFX9-NEXT: s_lshr_b32 s75, s9, 16 +; GFX9-NEXT: s_lshr_b32 s77, s9, 8 +; GFX9-NEXT: s_lshr_b32 s76, s8, 16 +; GFX9-NEXT: s_lshr_b32 s78, s8, 8 +; GFX9-NEXT: s_lshr_b32 s79, s11, 24 +; GFX9-NEXT: s_lshr_b32 s88, s11, 16 +; GFX9-NEXT: s_lshr_b32 s90, s11, 8 +; GFX9-NEXT: s_lshr_b32 s89, s10, 16 +; GFX9-NEXT: s_lshr_b32 s91, s10, 8 +; GFX9-NEXT: s_lshr_b32 s92, s13, 24 +; GFX9-NEXT: s_lshr_b32 s93, s13, 16 +; GFX9-NEXT: s_lshr_b32 s95, s13, 8 +; GFX9-NEXT: s_lshr_b32 s94, s12, 16 +; GFX9-NEXT: s_lshr_b32 s30, s12, 8 +; GFX9-NEXT: s_lshr_b32 s31, s15, 24 +; GFX9-NEXT: s_lshr_b32 s34, s15, 16 +; GFX9-NEXT: s_lshr_b32 s36, s15, 8 +; GFX9-NEXT: s_lshr_b32 s35, s14, 16 +; GFX9-NEXT: s_lshr_b32 s37, s14, 8 +; GFX9-NEXT: s_lshr_b32 s38, s17, 24 +; GFX9-NEXT: s_lshr_b32 s39, s17, 16 +; GFX9-NEXT: s_lshr_b32 s49, s17, 8 +; GFX9-NEXT: s_lshr_b32 s48, s16, 16 +; GFX9-NEXT: s_lshr_b32 s50, s16, 8 +; GFX9-NEXT: s_lshr_b32 s51, s19, 24 +; GFX9-NEXT: s_lshr_b32 s52, s19, 16 +; GFX9-NEXT: s_lshr_b32 s54, s19, 8 +; GFX9-NEXT: s_lshr_b32 s53, s18, 16 +; GFX9-NEXT: s_lshr_b32 s55, s18, 8 ; GFX9-NEXT: s_lshr_b64 s[44:45], s[4:5], 24 -; GFX9-NEXT: s_lshr_b64 s[42:43], s[28:29], 24 -; GFX9-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 -; GFX9-NEXT: s_lshr_b64 s[14:15], s[24:25], 24 -; GFX9-NEXT: s_lshr_b64 s[12:13], s[22:23], 24 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 -; GFX9-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 -; GFX9-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[42:43], s[6:7], 24 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[8:9], 24 +; GFX9-NEXT: s_lshr_b64 s[28:29], s[10:11], 24 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[12:13], 24 +; GFX9-NEXT: s_lshr_b64 s[24:25], s[14:15], 24 +; GFX9-NEXT: s_lshr_b64 s[22:23], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[20:21], s[18:19], 24 ; GFX9-NEXT: s_cbranch_execnz .LBB49_4 ; GFX9-NEXT: .LBB49_2: ; %cmp.true -; GFX9-NEXT: v_add_f32_e64 v6, s27, 1.0 -; GFX9-NEXT: v_add_f32_e64 v5, s26, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s9, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s8, 1.0 ; GFX9-NEXT: v_add_f32_e64 v2, s5, 1.0 ; GFX9-NEXT: v_add_f32_e64 v1, s4, 1.0 -; GFX9-NEXT: v_add_f32_e64 v8, s25, 1.0 -; GFX9-NEXT: v_add_f32_e64 v7, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v8, s11, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s10, 1.0 ; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[1:2] ; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[5:6] -; GFX9-NEXT: v_add_f32_e64 v10, s23, 1.0 -; GFX9-NEXT: v_add_f32_e64 v9, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v10, s13, 1.0 +; GFX9-NEXT: v_add_f32_e64 v9, s12, 1.0 ; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[7:8] -; GFX9-NEXT: v_add_f32_e64 v12, s21, 1.0 -; GFX9-NEXT: v_add_f32_e64 v11, s20, 1.0 -; GFX9-NEXT: v_add_f32_e64 v4, s29, 1.0 -; GFX9-NEXT: v_add_f32_e64 v3, s28, 1.0 +; GFX9-NEXT: v_add_f32_e64 v12, s15, 1.0 +; GFX9-NEXT: v_add_f32_e64 v11, s14, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s7, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s6, 1.0 ; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[9:10] -; GFX9-NEXT: v_add_f32_e64 v16, s19, 1.0 -; GFX9-NEXT: v_add_f32_e64 v15, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v16, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v15, s16, 1.0 ; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[3:4] ; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[11:12] -; GFX9-NEXT: v_add_f32_e64 v20, s17, 1.0 -; GFX9-NEXT: v_add_f32_e64 v19, s16, 1.0 +; GFX9-NEXT: v_add_f32_e64 v20, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v19, s18, 1.0 ; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[15:16] ; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 @@ -25766,31 +25998,31 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: .LBB49_3: ; GFX9-NEXT: ; implicit-def: $sgpr55 ; GFX9-NEXT: ; implicit-def: $sgpr53 -; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr20 ; GFX9-NEXT: ; implicit-def: $sgpr54 ; GFX9-NEXT: ; implicit-def: $sgpr52 ; GFX9-NEXT: ; implicit-def: $sgpr51 ; GFX9-NEXT: ; implicit-def: $sgpr50 ; GFX9-NEXT: ; implicit-def: $sgpr48 -; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr22 ; GFX9-NEXT: ; implicit-def: $sgpr49 ; GFX9-NEXT: ; implicit-def: $sgpr39 ; GFX9-NEXT: ; implicit-def: $sgpr38 ; GFX9-NEXT: ; implicit-def: $sgpr37 ; GFX9-NEXT: ; implicit-def: $sgpr35 -; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr24 ; GFX9-NEXT: ; implicit-def: $sgpr36 ; GFX9-NEXT: ; implicit-def: $sgpr34 ; GFX9-NEXT: ; implicit-def: $sgpr31 ; GFX9-NEXT: ; implicit-def: $sgpr30 ; GFX9-NEXT: ; implicit-def: $sgpr94 -; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; implicit-def: $sgpr95 ; GFX9-NEXT: ; implicit-def: $sgpr93 ; GFX9-NEXT: ; implicit-def: $sgpr92 ; GFX9-NEXT: ; implicit-def: $sgpr91 ; GFX9-NEXT: ; implicit-def: $sgpr89 -; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr28 ; GFX9-NEXT: ; implicit-def: $sgpr90 ; GFX9-NEXT: ; implicit-def: $sgpr88 ; GFX9-NEXT: ; implicit-def: $sgpr79 @@ -25819,20 +26051,20 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v21, s42 -; GFX9-NEXT: v_mov_b32_e32 v19, s16 -; GFX9-NEXT: v_mov_b32_e32 v20, s17 -; GFX9-NEXT: v_mov_b32_e32 v15, s18 -; GFX9-NEXT: v_mov_b32_e32 v16, s19 -; GFX9-NEXT: v_mov_b32_e32 v11, s20 -; GFX9-NEXT: v_mov_b32_e32 v12, s21 -; GFX9-NEXT: v_mov_b32_e32 v9, s22 -; GFX9-NEXT: v_mov_b32_e32 v10, s23 -; GFX9-NEXT: v_mov_b32_e32 v7, s24 -; GFX9-NEXT: v_mov_b32_e32 v8, s25 -; GFX9-NEXT: v_mov_b32_e32 v5, s26 -; GFX9-NEXT: v_mov_b32_e32 v6, s27 -; GFX9-NEXT: v_mov_b32_e32 v3, s28 -; GFX9-NEXT: v_mov_b32_e32 v4, s29 +; GFX9-NEXT: v_mov_b32_e32 v19, s18 +; GFX9-NEXT: v_mov_b32_e32 v20, s19 +; GFX9-NEXT: v_mov_b32_e32 v15, s16 +; GFX9-NEXT: v_mov_b32_e32 v16, s17 +; GFX9-NEXT: v_mov_b32_e32 v11, s14 +; GFX9-NEXT: v_mov_b32_e32 v12, s15 +; GFX9-NEXT: v_mov_b32_e32 v9, s12 +; GFX9-NEXT: v_mov_b32_e32 v10, s13 +; GFX9-NEXT: v_mov_b32_e32 v7, s10 +; GFX9-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-NEXT: v_mov_b32_e32 v6, s9 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mov_b32_e32 v17, s55 @@ -25875,15 +26107,15 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v27, s59 ; GFX9-NEXT: v_mov_b32_e32 v14, s57 ; GFX9-NEXT: v_mov_b32_e32 v18, s56 -; GFX9-NEXT: v_mov_b32_e32 v23, s12 -; GFX9-NEXT: v_mov_b32_e32 v24, s10 -; GFX9-NEXT: v_mov_b32_e32 v25, s8 -; GFX9-NEXT: v_mov_b32_e32 v26, s6 +; GFX9-NEXT: v_mov_b32_e32 v23, s26 +; GFX9-NEXT: v_mov_b32_e32 v24, s24 +; GFX9-NEXT: v_mov_b32_e32 v25, s22 +; GFX9-NEXT: v_mov_b32_e32 v26, s20 ; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v21, s40 -; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: v_mov_b32_e32 v22, s28 ; GFX9-NEXT: .LBB49_5: ; %end ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; GFX9-NEXT: v_or_b32_sdwa v17, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -31508,112 +31740,110 @@ define inreg <32 x i16> @bitcast_v8i64_to_v32i16_scalar(<8 x i64> inreg %a, i32 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_readfirstlane_b32 s4, v0 -; SI-NEXT: s_and_b64 s[6:7], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s5, v1 +; SI-NEXT: v_mov_b32_e32 v33, v1 +; SI-NEXT: v_mov_b32_e32 v32, v0 +; SI-NEXT: v_mov_b32_e32 v34, s16 +; SI-NEXT: v_mov_b32_e32 v35, s17 +; SI-NEXT: v_mov_b32_e32 v36, s18 +; SI-NEXT: v_mov_b32_e32 v37, s19 +; SI-NEXT: v_mov_b32_e32 v38, s20 +; SI-NEXT: v_mov_b32_e32 v39, s21 +; SI-NEXT: v_mov_b32_e32 v48, s22 +; SI-NEXT: v_mov_b32_e32 v49, s23 +; SI-NEXT: v_mov_b32_e32 v50, s24 +; SI-NEXT: v_mov_b32_e32 v51, s25 +; SI-NEXT: v_mov_b32_e32 v52, s26 +; SI-NEXT: v_mov_b32_e32 v53, s27 +; SI-NEXT: v_mov_b32_e32 v54, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v55, s29 ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s56, s5, 16 -; SI-NEXT: s_lshr_b32 s57, s29, 16 -; SI-NEXT: s_lshr_b32 s58, s27, 16 -; SI-NEXT: s_lshr_b32 s59, s25, 16 -; SI-NEXT: s_lshr_b32 s60, s23, 16 -; SI-NEXT: s_lshr_b32 s61, s21, 16 -; SI-NEXT: s_lshr_b32 s62, s19, 16 -; SI-NEXT: s_lshr_b32 s63, s17, 16 -; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[8:9], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[10:11], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[12:13], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[14:15], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[40:41], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[42:43], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[16:17], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshr_b64 v[29:30], v[32:33], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[54:55], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[52:53], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[38:39], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[34:35], 16 ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s4, 3 -; SI-NEXT: s_addc_u32 s5, s5, 0 -; SI-NEXT: s_add_u32 s28, s28, 3 -; SI-NEXT: s_addc_u32 s29, s29, 0 -; SI-NEXT: s_add_u32 s26, s26, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 -; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s25, s25, 0 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: s_add_u32 s16, s16, 3 -; SI-NEXT: s_addc_u32 s17, s17, 0 -; SI-NEXT: s_lshr_b32 s56, s5, 16 -; SI-NEXT: s_lshr_b32 s57, s29, 16 -; SI-NEXT: s_lshr_b32 s58, s27, 16 -; SI-NEXT: s_lshr_b32 s59, s25, 16 -; SI-NEXT: s_lshr_b32 s60, s23, 16 -; SI-NEXT: s_lshr_b32 s61, s21, 16 -; SI-NEXT: s_lshr_b32 s62, s19, 16 -; SI-NEXT: s_lshr_b32 s63, s17, 16 -; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[8:9], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[10:11], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[12:13], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[14:15], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[40:41], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[42:43], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[16:17], 16 +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_addc_u32_e32 v33, vcc, 0, v33, vcc +; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 +; SI-NEXT: v_addc_u32_e32 v55, vcc, 0, v55, vcc +; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 +; SI-NEXT: v_addc_u32_e32 v53, vcc, 0, v53, vcc +; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50 +; SI-NEXT: v_addc_u32_e32 v51, vcc, 0, v51, vcc +; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 +; SI-NEXT: v_addc_u32_e32 v49, vcc, 0, v49, vcc +; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 +; SI-NEXT: v_addc_u32_e32 v39, vcc, 0, v39, vcc +; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 +; SI-NEXT: v_addc_u32_e32 v37, vcc, 0, v37, vcc +; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 +; SI-NEXT: v_addc_u32_e32 v35, vcc, 0, v35, vcc +; SI-NEXT: v_lshr_b64 v[29:30], v[32:33], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[54:55], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[52:53], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[38:39], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[34:35], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 ; SI-NEXT: .LBB57_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v1, s44 -; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v3, s63 -; SI-NEXT: v_mov_b32_e32 v4, s18 -; SI-NEXT: v_mov_b32_e32 v5, s42 -; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v7, s62 -; SI-NEXT: v_mov_b32_e32 v8, s20 -; SI-NEXT: v_mov_b32_e32 v9, s40 -; SI-NEXT: v_mov_b32_e32 v10, s21 -; SI-NEXT: v_mov_b32_e32 v11, s61 -; SI-NEXT: v_mov_b32_e32 v12, s22 -; SI-NEXT: v_mov_b32_e32 v13, s14 -; SI-NEXT: v_mov_b32_e32 v14, s23 -; SI-NEXT: v_mov_b32_e32 v15, s60 -; SI-NEXT: v_mov_b32_e32 v16, s24 -; SI-NEXT: v_mov_b32_e32 v17, s12 -; SI-NEXT: v_mov_b32_e32 v18, s25 -; SI-NEXT: v_mov_b32_e32 v19, s59 -; SI-NEXT: v_mov_b32_e32 v20, s26 -; SI-NEXT: v_mov_b32_e32 v21, s10 -; SI-NEXT: v_mov_b32_e32 v22, s27 -; SI-NEXT: v_mov_b32_e32 v23, s58 -; SI-NEXT: v_mov_b32_e32 v24, s28 -; SI-NEXT: v_mov_b32_e32 v25, s8 -; SI-NEXT: v_mov_b32_e32 v26, s29 -; SI-NEXT: v_mov_b32_e32 v27, s57 -; SI-NEXT: v_mov_b32_e32 v28, s4 -; SI-NEXT: v_mov_b32_e32 v29, s6 -; SI-NEXT: v_mov_b32_e32 v30, s5 -; SI-NEXT: v_mov_b32_e32 v31, s56 +; SI-NEXT: v_mov_b32_e32 v0, v34 +; SI-NEXT: v_mov_b32_e32 v2, v35 +; SI-NEXT: v_mov_b32_e32 v4, v36 +; SI-NEXT: v_mov_b32_e32 v6, v37 +; SI-NEXT: v_mov_b32_e32 v8, v38 +; SI-NEXT: v_mov_b32_e32 v10, v39 +; SI-NEXT: v_mov_b32_e32 v12, v48 +; SI-NEXT: v_mov_b32_e32 v14, v49 +; SI-NEXT: v_mov_b32_e32 v16, v50 +; SI-NEXT: v_mov_b32_e32 v18, v51 +; SI-NEXT: v_mov_b32_e32 v20, v52 +; SI-NEXT: v_mov_b32_e32 v22, v53 +; SI-NEXT: v_mov_b32_e32 v24, v54 +; SI-NEXT: v_mov_b32_e32 v26, v55 +; SI-NEXT: v_mov_b32_e32 v28, v32 +; SI-NEXT: v_mov_b32_e32 v30, v33 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: -; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $sgpr63 -; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: s_branch .LBB57_2 ; ; VI-LABEL: bitcast_v8i64_to_v32i16_scalar: @@ -32246,111 +32476,139 @@ define inreg <8 x i64> @bitcast_v32i16_to_v8i64_scalar(<32 x i16> inreg %a, i32 ; VI-LABEL: bitcast_v32i16_to_v8i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, s16 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v5, s18 +; VI-NEXT: v_mov_b32_e32 v6, s19 +; VI-NEXT: v_mov_b32_e32 v7, s20 +; VI-NEXT: v_mov_b32_e32 v8, s21 +; VI-NEXT: v_mov_b32_e32 v9, s22 +; VI-NEXT: v_mov_b32_e32 v10, s23 +; VI-NEXT: v_mov_b32_e32 v11, s24 +; VI-NEXT: v_mov_b32_e32 v12, s25 +; VI-NEXT: v_mov_b32_e32 v13, s26 +; VI-NEXT: v_mov_b32_e32 v14, s27 +; VI-NEXT: v_mov_b32_e32 v15, s28 +; VI-NEXT: v_mov_b32_e32 v16, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_readfirstlane_b32 s6, v0 +; VI-NEXT: v_readfirstlane_b32 s6, v3 +; VI-NEXT: v_readfirstlane_b32 s7, v4 +; VI-NEXT: v_readfirstlane_b32 s8, v5 +; VI-NEXT: v_readfirstlane_b32 s9, v6 +; VI-NEXT: v_readfirstlane_b32 s10, v7 +; VI-NEXT: v_readfirstlane_b32 s11, v8 +; VI-NEXT: v_readfirstlane_b32 s12, v9 +; VI-NEXT: v_readfirstlane_b32 s13, v10 +; VI-NEXT: v_readfirstlane_b32 s14, v11 +; VI-NEXT: v_readfirstlane_b32 s15, v12 +; VI-NEXT: v_readfirstlane_b32 s16, v13 +; VI-NEXT: v_readfirstlane_b32 s17, v14 +; VI-NEXT: v_readfirstlane_b32 s18, v15 +; VI-NEXT: v_readfirstlane_b32 s19, v16 +; VI-NEXT: v_readfirstlane_b32 s20, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_readfirstlane_b32 s7, v1 +; VI-NEXT: v_readfirstlane_b32 s21, v1 ; VI-NEXT: s_cbranch_scc0 .LBB59_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB59_3 ; VI-NEXT: .LBB59_2: ; %cmp.true -; VI-NEXT: s_add_i32 s5, s7, 3 -; VI-NEXT: s_and_b32 s4, s7, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s6, 3 -; VI-NEXT: s_add_i32 s7, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s6, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s29, 3 -; VI-NEXT: s_add_i32 s6, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s28, 3 -; VI-NEXT: s_add_i32 s29, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s27, 3 -; VI-NEXT: s_add_i32 s28, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s26, 3 -; VI-NEXT: s_add_i32 s27, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s25, 3 -; VI-NEXT: s_add_i32 s26, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s15, 3 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s15, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s24, 3 -; VI-NEXT: s_add_i32 s25, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s14, 3 +; VI-NEXT: s_add_i32 s15, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s14, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s23, 3 -; VI-NEXT: s_add_i32 s24, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s13, 3 +; VI-NEXT: s_add_i32 s14, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s13, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s22, 3 -; VI-NEXT: s_add_i32 s23, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s12, 3 +; VI-NEXT: s_add_i32 s13, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s12, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s21, 3 -; VI-NEXT: s_add_i32 s22, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s11, 3 +; VI-NEXT: s_add_i32 s12, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s11, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s20, 3 -; VI-NEXT: s_add_i32 s21, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s10, 3 +; VI-NEXT: s_add_i32 s11, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s10, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s19, 3 -; VI-NEXT: s_add_i32 s20, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s9, 3 +; VI-NEXT: s_add_i32 s10, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s9, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s18, 3 -; VI-NEXT: s_add_i32 s19, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s8, 3 +; VI-NEXT: s_add_i32 s9, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s8, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s17, 3 -; VI-NEXT: s_add_i32 s18, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s7, 3 +; VI-NEXT: s_add_i32 s8, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s7, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_add_i32 s17, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s6, 3 +; VI-NEXT: s_add_i32 s7, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s6, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: s_add_i32 s6, s4, 0x30000 ; VI-NEXT: .LBB59_3: ; %end -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 -; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 -; VI-NEXT: v_mov_b32_e32 v13, s29 -; VI-NEXT: v_mov_b32_e32 v14, s6 -; VI-NEXT: v_mov_b32_e32 v15, s7 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v5, s11 +; VI-NEXT: v_mov_b32_e32 v6, s12 +; VI-NEXT: v_mov_b32_e32 v7, s13 +; VI-NEXT: v_mov_b32_e32 v8, s14 +; VI-NEXT: v_mov_b32_e32 v9, s15 +; VI-NEXT: v_mov_b32_e32 v10, s16 +; VI-NEXT: v_mov_b32_e32 v11, s17 +; VI-NEXT: v_mov_b32_e32 v12, s18 +; VI-NEXT: v_mov_b32_e32 v13, s19 +; VI-NEXT: v_mov_b32_e32 v14, s20 +; VI-NEXT: v_mov_b32_e32 v15, s21 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB59_4: ; VI-NEXT: s_branch .LBB59_2 @@ -32771,108 +33029,136 @@ define inreg <32 x half> @bitcast_v8i64_to_v32f16_scalar(<8 x i64> inreg %a, i32 ; SI-LABEL: bitcast_v8i64_to_v32f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, s16 +; SI-NEXT: v_mov_b32_e32 v4, s17 +; SI-NEXT: v_mov_b32_e32 v5, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s20 +; SI-NEXT: v_mov_b32_e32 v8, s21 +; SI-NEXT: v_mov_b32_e32 v9, s22 +; SI-NEXT: v_mov_b32_e32 v10, s23 +; SI-NEXT: v_mov_b32_e32 v11, s24 +; SI-NEXT: v_mov_b32_e32 v12, s25 +; SI-NEXT: v_mov_b32_e32 v13, s26 +; SI-NEXT: v_mov_b32_e32 v14, s27 +; SI-NEXT: v_mov_b32_e32 v15, s28 +; SI-NEXT: v_mov_b32_e32 v16, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_readfirstlane_b32 s20, v3 +; SI-NEXT: v_readfirstlane_b32 s21, v4 +; SI-NEXT: v_readfirstlane_b32 s18, v5 +; SI-NEXT: v_readfirstlane_b32 s19, v6 +; SI-NEXT: v_readfirstlane_b32 s16, v7 +; SI-NEXT: v_readfirstlane_b32 s17, v8 +; SI-NEXT: v_readfirstlane_b32 s14, v9 +; SI-NEXT: v_readfirstlane_b32 s15, v10 +; SI-NEXT: v_readfirstlane_b32 s12, v11 +; SI-NEXT: v_readfirstlane_b32 s13, v12 +; SI-NEXT: v_readfirstlane_b32 s10, v13 +; SI-NEXT: v_readfirstlane_b32 s11, v14 +; SI-NEXT: v_readfirstlane_b32 s7, v15 +; SI-NEXT: v_readfirstlane_b32 s8, v16 ; SI-NEXT: v_readfirstlane_b32 s6, v0 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s7, v1 +; SI-NEXT: v_readfirstlane_b32 s9, v1 ; SI-NEXT: s_cbranch_scc0 .LBB61_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: s_lshr_b32 s4, s9, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 ; SI-NEXT: s_lshr_b32 s4, s6, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 -; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s4, s8, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: s_lshr_b32 s4, s7, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: s_lshr_b32 s4, s11, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: s_lshr_b32 s4, s10, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: s_lshr_b32 s4, s13, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: s_lshr_b32 s4, s12, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: s_lshr_b32 s4, s15, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: s_lshr_b32 s4, s14, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: s_lshr_b32 s4, s17, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: s_lshr_b32 s4, s21, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_lshr_b32 s4, s20, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v28, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s16 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s20 ; SI-NEXT: s_cbranch_execnz .LBB61_3 ; SI-NEXT: .LBB61_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s16, 3 -; SI-NEXT: s_addc_u32 s5, s17, 0 -; SI-NEXT: s_lshr_b32 s8, s4, 16 -; SI-NEXT: s_lshr_b32 s9, s5, 16 -; SI-NEXT: s_add_u32 s10, s18, 3 -; SI-NEXT: s_addc_u32 s11, s19, 0 -; SI-NEXT: s_lshr_b32 s12, s10, 16 -; SI-NEXT: s_lshr_b32 s13, s11, 16 -; SI-NEXT: s_add_u32 s14, s20, 3 -; SI-NEXT: s_addc_u32 s15, s21, 0 -; SI-NEXT: s_lshr_b32 s16, s14, 16 -; SI-NEXT: s_lshr_b32 s17, s15, 16 -; SI-NEXT: s_add_u32 s18, s22, 3 -; SI-NEXT: s_addc_u32 s19, s23, 0 -; SI-NEXT: s_lshr_b32 s20, s18, 16 -; SI-NEXT: s_lshr_b32 s21, s19, 16 -; SI-NEXT: s_add_u32 s22, s24, 3 -; SI-NEXT: s_addc_u32 s23, s25, 0 -; SI-NEXT: s_lshr_b32 s24, s22, 16 -; SI-NEXT: s_lshr_b32 s25, s23, 16 -; SI-NEXT: s_add_u32 s26, s26, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 -; SI-NEXT: s_lshr_b32 s40, s26, 16 -; SI-NEXT: s_lshr_b32 s41, s27, 16 -; SI-NEXT: s_add_u32 s28, s28, 3 -; SI-NEXT: s_addc_u32 s29, s29, 0 -; SI-NEXT: s_lshr_b32 s42, s28, 16 -; SI-NEXT: s_lshr_b32 s43, s29, 16 +; SI-NEXT: s_add_u32 s4, s20, 3 +; SI-NEXT: s_addc_u32 s5, s21, 0 +; SI-NEXT: s_lshr_b32 s20, s4, 16 +; SI-NEXT: s_lshr_b32 s21, s5, 16 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_lshr_b32 s22, s18, 16 +; SI-NEXT: s_lshr_b32 s23, s19, 16 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_lshr_b32 s24, s16, 16 +; SI-NEXT: s_lshr_b32 s25, s17, 16 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 +; SI-NEXT: s_lshr_b32 s26, s14, 16 +; SI-NEXT: s_lshr_b32 s27, s15, 16 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_lshr_b32 s28, s12, 16 +; SI-NEXT: s_lshr_b32 s29, s13, 16 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_lshr_b32 s40, s10, 16 +; SI-NEXT: s_lshr_b32 s41, s11, 16 +; SI-NEXT: s_add_u32 s7, s7, 3 +; SI-NEXT: s_addc_u32 s8, s8, 0 +; SI-NEXT: s_lshr_b32 s42, s7, 16 +; SI-NEXT: s_lshr_b32 s43, s8, 16 ; SI-NEXT: s_add_u32 s6, s6, 3 -; SI-NEXT: s_addc_u32 s7, s7, 0 +; SI-NEXT: s_addc_u32 s9, s9, 0 ; SI-NEXT: s_lshr_b32 s44, s6, 16 -; SI-NEXT: s_lshr_b32 s45, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s7 +; SI-NEXT: s_lshr_b32 s45, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v28, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v31, s45 @@ -32881,16 +33167,16 @@ define inreg <32 x half> @bitcast_v8i64_to_v32f16_scalar(<8 x i64> inreg %a, i32 ; SI-NEXT: v_cvt_f32_f16_e32 v25, s42 ; SI-NEXT: v_cvt_f32_f16_e32 v23, s41 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s20 ; SI-NEXT: .LBB61_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB61_4: @@ -34160,7 +34446,35 @@ define inreg <32 x bfloat> @bitcast_v8i64_to_v32bf16_scalar(<8 x i64> inreg %a, ; SI-LABEL: bitcast_v8i64_to_v32bf16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, s16 +; SI-NEXT: v_mov_b32_e32 v4, s17 +; SI-NEXT: v_mov_b32_e32 v5, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s20 +; SI-NEXT: v_mov_b32_e32 v8, s21 +; SI-NEXT: v_mov_b32_e32 v9, s22 +; SI-NEXT: v_mov_b32_e32 v10, s23 +; SI-NEXT: v_mov_b32_e32 v11, s24 +; SI-NEXT: v_mov_b32_e32 v12, s25 +; SI-NEXT: v_mov_b32_e32 v13, s26 +; SI-NEXT: v_mov_b32_e32 v14, s27 +; SI-NEXT: v_mov_b32_e32 v15, s28 +; SI-NEXT: v_mov_b32_e32 v16, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_readfirstlane_b32 s56, v3 +; SI-NEXT: v_readfirstlane_b32 s57, v4 +; SI-NEXT: v_readfirstlane_b32 s58, v5 +; SI-NEXT: v_readfirstlane_b32 s59, v6 +; SI-NEXT: v_readfirstlane_b32 s60, v7 +; SI-NEXT: v_readfirstlane_b32 s61, v8 +; SI-NEXT: v_readfirstlane_b32 s62, v9 +; SI-NEXT: v_readfirstlane_b32 s63, v10 +; SI-NEXT: v_readfirstlane_b32 s72, v11 +; SI-NEXT: v_readfirstlane_b32 s73, v12 +; SI-NEXT: v_readfirstlane_b32 s74, v13 +; SI-NEXT: v_readfirstlane_b32 s75, v14 +; SI-NEXT: v_readfirstlane_b32 s76, v15 +; SI-NEXT: v_readfirstlane_b32 s77, v16 ; SI-NEXT: v_readfirstlane_b32 s78, v0 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_readfirstlane_b32 s79, v1 @@ -34170,50 +34484,50 @@ define inreg <32 x bfloat> @bitcast_v8i64_to_v32bf16_scalar(<8 x i64> inreg %a, ; SI-NEXT: s_lshl_b32 s7, s79, 16 ; SI-NEXT: s_and_b32 s8, s78, 0xffff0000 ; SI-NEXT: s_lshl_b32 s9, s78, 16 -; SI-NEXT: s_and_b32 s10, s29, 0xffff0000 -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: s_and_b32 s12, s28, 0xffff0000 -; SI-NEXT: s_lshl_b32 s13, s28, 16 -; SI-NEXT: s_and_b32 s14, s27, 0xffff0000 -; SI-NEXT: s_lshl_b32 s15, s27, 16 -; SI-NEXT: s_and_b32 s40, s26, 0xffff0000 -; SI-NEXT: s_lshl_b32 s41, s26, 16 -; SI-NEXT: s_and_b32 s42, s25, 0xffff0000 -; SI-NEXT: s_lshl_b32 s43, s25, 16 -; SI-NEXT: s_and_b32 s44, s24, 0xffff0000 -; SI-NEXT: s_lshl_b32 s45, s24, 16 -; SI-NEXT: s_and_b32 s46, s23, 0xffff0000 -; SI-NEXT: s_lshl_b32 s47, s23, 16 -; SI-NEXT: s_and_b32 s56, s22, 0xffff0000 -; SI-NEXT: s_lshl_b32 s57, s22, 16 -; SI-NEXT: s_and_b32 s58, s21, 0xffff0000 -; SI-NEXT: s_lshl_b32 s59, s21, 16 -; SI-NEXT: s_and_b32 s60, s20, 0xffff0000 -; SI-NEXT: s_lshl_b32 s61, s20, 16 -; SI-NEXT: s_and_b32 s62, s19, 0xffff0000 -; SI-NEXT: s_lshl_b32 s63, s19, 16 -; SI-NEXT: s_and_b32 s72, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s73, s18, 16 -; SI-NEXT: s_and_b32 s74, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s75, s17, 16 -; SI-NEXT: s_and_b32 s76, s16, 0xffff0000 -; SI-NEXT: s_lshl_b32 s77, s16, 16 +; SI-NEXT: s_and_b32 s10, s77, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s77, 16 +; SI-NEXT: s_and_b32 s12, s76, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s76, 16 +; SI-NEXT: s_and_b32 s14, s75, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s75, 16 +; SI-NEXT: s_and_b32 s16, s74, 0xffff0000 +; SI-NEXT: s_lshl_b32 s17, s74, 16 +; SI-NEXT: s_and_b32 s18, s73, 0xffff0000 +; SI-NEXT: s_lshl_b32 s19, s73, 16 +; SI-NEXT: s_and_b32 s20, s72, 0xffff0000 +; SI-NEXT: s_lshl_b32 s21, s72, 16 +; SI-NEXT: s_and_b32 s22, s63, 0xffff0000 +; SI-NEXT: s_lshl_b32 s23, s63, 16 +; SI-NEXT: s_and_b32 s24, s62, 0xffff0000 +; SI-NEXT: s_lshl_b32 s25, s62, 16 +; SI-NEXT: s_and_b32 s26, s61, 0xffff0000 +; SI-NEXT: s_lshl_b32 s27, s61, 16 +; SI-NEXT: s_and_b32 s28, s60, 0xffff0000 +; SI-NEXT: s_lshl_b32 s29, s60, 16 +; SI-NEXT: s_and_b32 s40, s59, 0xffff0000 +; SI-NEXT: s_lshl_b32 s41, s59, 16 +; SI-NEXT: s_and_b32 s42, s58, 0xffff0000 +; SI-NEXT: s_lshl_b32 s43, s58, 16 +; SI-NEXT: s_and_b32 s44, s57, 0xffff0000 +; SI-NEXT: s_lshl_b32 s45, s57, 16 +; SI-NEXT: s_and_b32 s46, s56, 0xffff0000 +; SI-NEXT: s_lshl_b32 s47, s56, 16 ; SI-NEXT: s_cbranch_execnz .LBB65_3 ; SI-NEXT: .LBB65_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s16, 3 -; SI-NEXT: s_addc_u32 s5, s17, 0 -; SI-NEXT: s_add_u32 s16, s18, 3 -; SI-NEXT: s_addc_u32 s17, s19, 0 -; SI-NEXT: s_add_u32 s18, s20, 3 -; SI-NEXT: s_addc_u32 s19, s21, 0 -; SI-NEXT: s_add_u32 s20, s22, 3 -; SI-NEXT: s_addc_u32 s21, s23, 0 -; SI-NEXT: s_add_u32 s22, s24, 3 -; SI-NEXT: s_addc_u32 s23, s25, 0 -; SI-NEXT: s_add_u32 s24, s26, 3 -; SI-NEXT: s_addc_u32 s15, s27, 0 -; SI-NEXT: s_add_u32 s13, s28, 3 -; SI-NEXT: s_addc_u32 s11, s29, 0 +; SI-NEXT: s_add_u32 s4, s56, 3 +; SI-NEXT: s_addc_u32 s5, s57, 0 +; SI-NEXT: s_add_u32 s43, s58, 3 +; SI-NEXT: s_addc_u32 s41, s59, 0 +; SI-NEXT: s_add_u32 s29, s60, 3 +; SI-NEXT: s_addc_u32 s27, s61, 0 +; SI-NEXT: s_add_u32 s25, s62, 3 +; SI-NEXT: s_addc_u32 s23, s63, 0 +; SI-NEXT: s_add_u32 s21, s72, 3 +; SI-NEXT: s_addc_u32 s19, s73, 0 +; SI-NEXT: s_add_u32 s17, s74, 3 +; SI-NEXT: s_addc_u32 s15, s75, 0 +; SI-NEXT: s_add_u32 s13, s76, 3 +; SI-NEXT: s_addc_u32 s11, s77, 0 ; SI-NEXT: s_add_u32 s9, s78, 3 ; SI-NEXT: s_addc_u32 s7, s79, 0 ; SI-NEXT: s_and_b32 s6, s7, 0xffff0000 @@ -34226,51 +34540,51 @@ define inreg <32 x bfloat> @bitcast_v8i64_to_v32bf16_scalar(<8 x i64> inreg %a, ; SI-NEXT: s_lshl_b32 s13, s13, 16 ; SI-NEXT: s_and_b32 s14, s15, 0xffff0000 ; SI-NEXT: s_lshl_b32 s15, s15, 16 -; SI-NEXT: s_and_b32 s40, s24, 0xffff0000 -; SI-NEXT: s_lshl_b32 s41, s24, 16 -; SI-NEXT: s_and_b32 s42, s23, 0xffff0000 -; SI-NEXT: s_lshl_b32 s43, s23, 16 -; SI-NEXT: s_and_b32 s44, s22, 0xffff0000 -; SI-NEXT: s_lshl_b32 s45, s22, 16 -; SI-NEXT: s_and_b32 s46, s21, 0xffff0000 -; SI-NEXT: s_lshl_b32 s47, s21, 16 -; SI-NEXT: s_and_b32 s56, s20, 0xffff0000 -; SI-NEXT: s_lshl_b32 s57, s20, 16 -; SI-NEXT: s_and_b32 s58, s19, 0xffff0000 -; SI-NEXT: s_lshl_b32 s59, s19, 16 -; SI-NEXT: s_and_b32 s60, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s61, s18, 16 -; SI-NEXT: s_and_b32 s62, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s63, s17, 16 -; SI-NEXT: s_and_b32 s72, s16, 0xffff0000 -; SI-NEXT: s_lshl_b32 s73, s16, 16 -; SI-NEXT: s_and_b32 s74, s5, 0xffff0000 -; SI-NEXT: s_lshl_b32 s75, s5, 16 -; SI-NEXT: s_and_b32 s76, s4, 0xffff0000 -; SI-NEXT: s_lshl_b32 s77, s4, 16 +; SI-NEXT: s_and_b32 s16, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_and_b32 s18, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_and_b32 s20, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_and_b32 s22, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s23, s23, 16 +; SI-NEXT: s_and_b32 s24, s25, 0xffff0000 +; SI-NEXT: s_lshl_b32 s25, s25, 16 +; SI-NEXT: s_and_b32 s26, s27, 0xffff0000 +; SI-NEXT: s_lshl_b32 s27, s27, 16 +; SI-NEXT: s_and_b32 s28, s29, 0xffff0000 +; SI-NEXT: s_lshl_b32 s29, s29, 16 +; SI-NEXT: s_and_b32 s40, s41, 0xffff0000 +; SI-NEXT: s_lshl_b32 s41, s41, 16 +; SI-NEXT: s_and_b32 s42, s43, 0xffff0000 +; SI-NEXT: s_lshl_b32 s43, s43, 16 +; SI-NEXT: s_and_b32 s44, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s45, s5, 16 +; SI-NEXT: s_and_b32 s46, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s47, s4, 16 ; SI-NEXT: .LBB65_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s77 -; SI-NEXT: v_mov_b32_e32 v1, s76 -; SI-NEXT: v_mov_b32_e32 v2, s75 -; SI-NEXT: v_mov_b32_e32 v3, s74 -; SI-NEXT: v_mov_b32_e32 v4, s73 -; SI-NEXT: v_mov_b32_e32 v5, s72 -; SI-NEXT: v_mov_b32_e32 v6, s63 -; SI-NEXT: v_mov_b32_e32 v7, s62 -; SI-NEXT: v_mov_b32_e32 v8, s61 -; SI-NEXT: v_mov_b32_e32 v9, s60 -; SI-NEXT: v_mov_b32_e32 v10, s59 -; SI-NEXT: v_mov_b32_e32 v11, s58 -; SI-NEXT: v_mov_b32_e32 v12, s57 -; SI-NEXT: v_mov_b32_e32 v13, s56 -; SI-NEXT: v_mov_b32_e32 v14, s47 -; SI-NEXT: v_mov_b32_e32 v15, s46 -; SI-NEXT: v_mov_b32_e32 v16, s45 -; SI-NEXT: v_mov_b32_e32 v17, s44 -; SI-NEXT: v_mov_b32_e32 v18, s43 -; SI-NEXT: v_mov_b32_e32 v19, s42 -; SI-NEXT: v_mov_b32_e32 v20, s41 -; SI-NEXT: v_mov_b32_e32 v21, s40 +; SI-NEXT: v_mov_b32_e32 v0, s47 +; SI-NEXT: v_mov_b32_e32 v1, s46 +; SI-NEXT: v_mov_b32_e32 v2, s45 +; SI-NEXT: v_mov_b32_e32 v3, s44 +; SI-NEXT: v_mov_b32_e32 v4, s43 +; SI-NEXT: v_mov_b32_e32 v5, s42 +; SI-NEXT: v_mov_b32_e32 v6, s41 +; SI-NEXT: v_mov_b32_e32 v7, s40 +; SI-NEXT: v_mov_b32_e32 v8, s29 +; SI-NEXT: v_mov_b32_e32 v9, s28 +; SI-NEXT: v_mov_b32_e32 v10, s27 +; SI-NEXT: v_mov_b32_e32 v11, s26 +; SI-NEXT: v_mov_b32_e32 v12, s25 +; SI-NEXT: v_mov_b32_e32 v13, s24 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v15, s22 +; SI-NEXT: v_mov_b32_e32 v16, s21 +; SI-NEXT: v_mov_b32_e32 v17, s20 +; SI-NEXT: v_mov_b32_e32 v18, s19 +; SI-NEXT: v_mov_b32_e32 v19, s18 +; SI-NEXT: v_mov_b32_e32 v20, s17 +; SI-NEXT: v_mov_b32_e32 v21, s16 ; SI-NEXT: v_mov_b32_e32 v22, s15 ; SI-NEXT: v_mov_b32_e32 v23, s14 ; SI-NEXT: v_mov_b32_e32 v24, s13 @@ -34283,20 +34597,6 @@ define inreg <32 x bfloat> @bitcast_v8i64_to_v32bf16_scalar(<8 x i64> inreg %a, ; SI-NEXT: v_mov_b32_e32 v31, s6 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB65_4: -; SI-NEXT: ; implicit-def: $sgpr77 -; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr75 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr73 -; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr63 -; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: ; implicit-def: $sgpr47 ; SI-NEXT: ; implicit-def: $sgpr46 ; SI-NEXT: ; implicit-def: $sgpr45 @@ -34305,6 +34605,20 @@ define inreg <32 x bfloat> @bitcast_v8i64_to_v32bf16_scalar(<8 x i64> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr42 ; SI-NEXT: ; implicit-def: $sgpr41 ; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr21 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; implicit-def: $sgpr16 ; SI-NEXT: ; implicit-def: $sgpr15 ; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: ; implicit-def: $sgpr13 @@ -36080,695 +36394,665 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; VI-LABEL: bitcast_v32bf16_to_v8i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v20, s30, 0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_writelane_b32 v20, s31, 1 -; VI-NEXT: v_readfirstlane_b32 s30, v0 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_readfirstlane_b32 s31, v1 -; VI-NEXT: s_cbranch_scc0 .LBB67_3 -; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB67_4 -; VI-NEXT: .LBB67_2: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v16, 0x40c00000 -; VI-NEXT: s_lshl_b32 s4, s30, 16 -; VI-NEXT: v_add_f32_e32 v0, s4, v16 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; VI-NEXT: v_add_f32_e32 v1, s4, v16 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v16 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: s_and_b32 s6, s28, 0xffff0000 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; VI-NEXT: v_add_f32_e32 v2, s6, v16 -; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc -; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[0:1] -; VI-NEXT: v_cndmask_b32_e64 v0, v3, v4, s[4:5] -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; VI-NEXT: s_and_b32 s5, s26, 0xffff0000 -; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[0:1] -; VI-NEXT: v_add_f32_e32 v0, s5, v16 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: s_lshl_b32 s4, s26, 16 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_add_f32_e32 v0, s4, v16 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_and_b32 s5, s24, 0xffff0000 -; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[0:1] -; VI-NEXT: v_add_f32_e32 v0, s5, v16 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_add_f32_e32 v0, s4, v16 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_and_b32 s5, s22, 0xffff0000 -; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[0:1] -; VI-NEXT: v_add_f32_e32 v0, s5, v16 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_add_f32_e32 v0, s4, v16 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_and_b32 s5, s20, 0xffff0000 -; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[0:1] -; VI-NEXT: v_add_f32_e32 v0, s5, v16 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_add_f32_e32 v0, s4, v16 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_and_b32 s5, s18, 0xffff0000 -; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] -; VI-NEXT: v_add_f32_e32 v0, s5, v16 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_add_f32_e32 v0, s4, v16 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] -; VI-NEXT: v_add_f32_e32 v0, s4, v16 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; VI-NEXT: v_add_f32_e32 v0, s6, v16 -; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; VI-NEXT: s_lshl_b32 s6, s17, 16 -; VI-NEXT: v_cndmask_b32_e64 v0, v1, v3, s[4:5] -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; VI-NEXT: v_add_f32_e32 v3, s6, v16 -; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] -; VI-NEXT: v_bfe_u32 v1, v3, 16, 1 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; VI-NEXT: v_add_f32_e32 v3, s6, v16 -; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc -; VI-NEXT: s_lshl_b32 s6, s19, 16 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 -; VI-NEXT: v_add_f32_e32 v3, s6, v16 -; VI-NEXT: v_cndmask_b32_e64 v17, v1, v5, s[4:5] -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: s_and_b32 s6, s19, 0xffff0000 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; VI-NEXT: v_add_f32_e32 v3, s6, v16 -; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] -; VI-NEXT: v_bfe_u32 v9, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v3 -; VI-NEXT: s_lshl_b32 s6, s21, 16 -; VI-NEXT: v_mov_b32_e32 v1, v17 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_cndmask_b32_e64 v17, v5, v7, s[4:5] -; VI-NEXT: v_add_f32_e32 v5, s6, v16 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 -; VI-NEXT: s_and_b32 s6, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v9, v11, vcc -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; VI-NEXT: v_add_f32_e32 v5, s6, v16 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 -; VI-NEXT: v_bfe_u32 v11, v5, 16, 1 -; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] -; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v5 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 -; VI-NEXT: s_lshl_b32 s6, s23, 16 -; VI-NEXT: v_mov_b32_e32 v3, v17 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_cndmask_b32_e64 v17, v7, v9, s[4:5] -; VI-NEXT: v_add_f32_e32 v7, s6, v16 -; VI-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc -; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 -; VI-NEXT: s_and_b32 s6, s23, 0xffff0000 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v7, v7 -; VI-NEXT: v_add_f32_e32 v7, s6, v16 -; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] -; VI-NEXT: v_bfe_u32 v13, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v7 -; VI-NEXT: s_lshl_b32 s6, s25, 16 -; VI-NEXT: v_mov_b32_e32 v5, v17 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_cndmask_b32_e64 v17, v9, v11, s[4:5] -; VI-NEXT: v_add_f32_e32 v9, s6, v16 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 -; VI-NEXT: s_and_b32 s6, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v7, v13, v15, vcc -; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v9, v9 -; VI-NEXT: v_add_f32_e32 v9, s6, v16 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 -; VI-NEXT: v_bfe_u32 v15, v9, 16, 1 -; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] -; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v9 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_mov_b32_e32 v10, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_cbranch_scc0 .LBB67_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB67_3 +; VI-NEXT: .LBB67_2: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[16:17] +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v17, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 -; VI-NEXT: v_mov_b32_e32 v7, v17 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: v_cndmask_b32_e32 v9, v15, v17, vcc -; VI-NEXT: s_lshl_b32 s6, s27, 16 -; VI-NEXT: v_cndmask_b32_e64 v17, v11, v13, s[4:5] -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 -; VI-NEXT: v_add_f32_e32 v11, s6, v16 -; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] -; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 -; VI-NEXT: s_and_b32 s6, s27, 0xffff0000 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v11, v11 -; VI-NEXT: v_add_f32_e32 v11, s6, v16 -; VI-NEXT: v_mov_b32_e32 v9, v17 -; VI-NEXT: v_bfe_u32 v17, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v11 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_cndmask_b32_e32 v14, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[17:18] +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v13 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v17, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15 +; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v11 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: s_and_b32 s7, s31, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc -; VI-NEXT: v_cndmask_b32_e64 v17, v13, v15, s[4:5] -; VI-NEXT: v_add_f32_e32 v13, s7, v16 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 -; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc ; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 -; VI-NEXT: s_lshl_b32 s6, s31, 16 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 -; VI-NEXT: v_mov_b32_e32 v11, v17 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v13 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_add_f32_e32 v13, s6, v16 -; VI-NEXT: v_cndmask_b32_e32 v15, v15, v17, vcc -; VI-NEXT: v_bfe_u32 v17, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v13 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_add_f32_e32 v13, s4, v16 -; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v18, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[18:19] +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v11 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 ; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_cndmask_b32_e32 v13, v15, v19, vcc -; VI-NEXT: v_add_f32_e32 v15, s4, v16 -; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: v_cndmask_b32_e32 v15, v16, v19, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 -; VI-NEXT: v_lshrrev_b64 v[15:16], 16, v[15:16] -; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[17:18] -; VI-NEXT: v_mov_b32_e32 v13, v15 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v18, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v19, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_cndmask_b32_e32 v10, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v10 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[19:20] +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v9 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v19, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_lshrrev_b64 v[19:20], 16, v[19:20] +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v20, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[20:21] +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v7 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v20, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[20:21] +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v21, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[21:22] +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v21, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_lshrrev_b64 v[21:22], 16, v[21:22] +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v22, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[22:23] +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v22, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[22:23] +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v23, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[23:24] +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v23, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[23:24] +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v24, v3, v5, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[24:25] +; VI-NEXT: v_mov_b32_e32 v1, v23 +; VI-NEXT: v_mov_b32_e32 v3, v22 +; VI-NEXT: v_mov_b32_e32 v5, v21 +; VI-NEXT: v_mov_b32_e32 v7, v20 +; VI-NEXT: v_mov_b32_e32 v9, v19 +; VI-NEXT: v_mov_b32_e32 v11, v18 +; VI-NEXT: v_mov_b32_e32 v13, v17 ; VI-NEXT: v_mov_b32_e32 v15, v16 -; VI-NEXT: s_branch .LBB67_5 -; VI-NEXT: .LBB67_3: -; VI-NEXT: s_branch .LBB67_2 -; VI-NEXT: .LBB67_4: -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 -; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 -; VI-NEXT: v_mov_b32_e32 v13, s29 -; VI-NEXT: v_mov_b32_e32 v14, s30 -; VI-NEXT: v_mov_b32_e32 v15, s31 -; VI-NEXT: .LBB67_5: ; %end -; VI-NEXT: v_readlane_b32 s31, v20, 1 -; VI-NEXT: v_readlane_b32 s30, v20, 0 -; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: .LBB67_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB67_4: +; VI-NEXT: s_branch .LBB67_2 ; ; GFX9-LABEL: bitcast_v32bf16_to_v8i64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v20, s30, 0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: v_writelane_b32 v20, s31, 1 -; GFX9-NEXT: v_readfirstlane_b32 s30, v0 +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_readfirstlane_b32 s31, v1 -; GFX9-NEXT: s_cbranch_scc0 .LBB67_3 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB67_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB67_4 +; GFX9-NEXT: s_cbranch_execnz .LBB67_3 ; GFX9-NEXT: .LBB67_2: ; %cmp.true -; GFX9-NEXT: s_and_b32 s4, s31, 0xffff0000 -; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s31, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b32 s4, s30, 0xffff0000 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: s_lshl_b32 s4, s30, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v16 +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v16, v16, v15 +; GFX9-NEXT: v_add_u32_e32 v16, 0x7fff, v16 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v18, vcc ; GFX9-NEXT: v_mov_b32_e32 v16, 0xffff -; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_and_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v15, v17, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v14 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v14, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v14 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v14 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v14, v16, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v14, v17, 16, v14 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v13 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v13, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v13 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v13, v17, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v12 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v12, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v12 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v12 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v12, v16, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v12, v17, 16, v12 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v11 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v11, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v11 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v11 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v11, v16, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v11, v17, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v10 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v10, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v10 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v10, v16, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v10, v17, 16, v10 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v9 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v9, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v9 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v9, v16, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v9, v17, 16, v9 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v8, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v8 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v8, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v8, v17, 16, v8 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v7 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v7, v16, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v7, v17, 16, v7 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v6 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v6 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v6, v16, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v6, v17, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v5 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v5 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v5, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v5, v17, 16, v5 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v4 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v4 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s29, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s28, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s28, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s27, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s26, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s26, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s25, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s24, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s24, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s23, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s22, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s21, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s20, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s19, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v17, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s18, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v17, v17, v2 -; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v4, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v4, v17, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v3 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v2 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v17, v17, v1 -; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s17, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc -; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_lshl_or_b32 v2, v17, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v17, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v17 -; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_bfe_u32 v18, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v1 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v17, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v0 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 -; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc ; GFX9-NEXT: v_bfe_u32 v18, v0, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v18, v18, v0 ; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: v_and_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v16 -; GFX9-NEXT: s_branch .LBB67_5 -; GFX9-NEXT: .LBB67_3: -; GFX9-NEXT: s_branch .LBB67_2 -; GFX9-NEXT: .LBB67_4: -; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 -; GFX9-NEXT: v_mov_b32_e32 v4, s20 -; GFX9-NEXT: v_mov_b32_e32 v5, s21 -; GFX9-NEXT: v_mov_b32_e32 v6, s22 -; GFX9-NEXT: v_mov_b32_e32 v7, s23 -; GFX9-NEXT: v_mov_b32_e32 v8, s24 -; GFX9-NEXT: v_mov_b32_e32 v9, s25 -; GFX9-NEXT: v_mov_b32_e32 v10, s26 -; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: v_mov_b32_e32 v12, s28 -; GFX9-NEXT: v_mov_b32_e32 v13, s29 -; GFX9-NEXT: v_mov_b32_e32 v14, s30 -; GFX9-NEXT: v_mov_b32_e32 v15, s31 -; GFX9-NEXT: .LBB67_5: ; %end -; GFX9-NEXT: v_readlane_b32 s31, v20, 1 -; GFX9-NEXT: v_readlane_b32 s30, v20, 0 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v17, 16, v0 +; GFX9-NEXT: .LBB67_3: ; %end ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB67_4: +; GFX9-NEXT: s_branch .LBB67_2 ; ; GFX11-TRUE16-LABEL: bitcast_v32bf16_to_v8i64_scalar: ; GFX11-TRUE16: ; %bb.0: @@ -39133,345 +39417,373 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v4, s30, 0 -; SI-NEXT: v_writelane_b32 v4, s31, 1 -; SI-NEXT: v_writelane_b32 v4, s34, 2 -; SI-NEXT: v_writelane_b32 v4, s35, 3 -; SI-NEXT: v_writelane_b32 v4, s36, 4 -; SI-NEXT: v_writelane_b32 v4, s37, 5 -; SI-NEXT: v_writelane_b32 v4, s38, 6 -; SI-NEXT: v_writelane_b32 v4, s39, 7 -; SI-NEXT: v_writelane_b32 v4, s48, 8 -; SI-NEXT: v_writelane_b32 v4, s49, 9 -; SI-NEXT: v_writelane_b32 v4, s50, 10 -; SI-NEXT: v_writelane_b32 v4, s51, 11 -; SI-NEXT: v_writelane_b32 v4, s52, 12 -; SI-NEXT: v_writelane_b32 v4, s53, 13 -; SI-NEXT: v_writelane_b32 v4, s54, 14 -; SI-NEXT: v_writelane_b32 v4, s55, 15 -; SI-NEXT: v_writelane_b32 v4, s64, 16 -; SI-NEXT: v_writelane_b32 v4, s65, 17 -; SI-NEXT: v_writelane_b32 v4, s66, 18 -; SI-NEXT: v_writelane_b32 v4, s67, 19 -; SI-NEXT: v_writelane_b32 v4, s68, 20 -; SI-NEXT: v_writelane_b32 v4, s69, 21 -; SI-NEXT: v_writelane_b32 v4, s70, 22 -; SI-NEXT: v_writelane_b32 v4, s71, 23 -; SI-NEXT: v_writelane_b32 v4, s80, 24 -; SI-NEXT: v_writelane_b32 v4, s81, 25 -; SI-NEXT: v_writelane_b32 v4, s82, 26 -; SI-NEXT: v_writelane_b32 v4, s83, 27 +; SI-NEXT: v_writelane_b32 v18, s30, 0 +; SI-NEXT: v_writelane_b32 v18, s31, 1 +; SI-NEXT: v_writelane_b32 v18, s34, 2 +; SI-NEXT: v_writelane_b32 v18, s35, 3 +; SI-NEXT: v_writelane_b32 v18, s36, 4 +; SI-NEXT: v_writelane_b32 v18, s37, 5 +; SI-NEXT: v_writelane_b32 v18, s38, 6 +; SI-NEXT: v_writelane_b32 v18, s39, 7 +; SI-NEXT: v_writelane_b32 v18, s48, 8 +; SI-NEXT: v_writelane_b32 v18, s49, 9 +; SI-NEXT: v_writelane_b32 v18, s50, 10 +; SI-NEXT: v_writelane_b32 v18, s51, 11 +; SI-NEXT: v_writelane_b32 v18, s52, 12 +; SI-NEXT: v_writelane_b32 v18, s53, 13 +; SI-NEXT: v_writelane_b32 v18, s54, 14 +; SI-NEXT: v_writelane_b32 v18, s55, 15 +; SI-NEXT: v_writelane_b32 v18, s64, 16 +; SI-NEXT: v_writelane_b32 v18, s65, 17 +; SI-NEXT: v_writelane_b32 v18, s66, 18 +; SI-NEXT: v_writelane_b32 v18, s67, 19 +; SI-NEXT: v_writelane_b32 v18, s68, 20 +; SI-NEXT: v_writelane_b32 v18, s69, 21 +; SI-NEXT: v_writelane_b32 v18, s70, 22 +; SI-NEXT: v_writelane_b32 v18, s71, 23 +; SI-NEXT: v_writelane_b32 v18, s80, 24 +; SI-NEXT: v_writelane_b32 v18, s81, 25 +; SI-NEXT: v_writelane_b32 v18, s82, 26 +; SI-NEXT: v_writelane_b32 v18, s83, 27 +; SI-NEXT: v_mov_b32_e32 v4, s16 +; SI-NEXT: v_mov_b32_e32 v5, s17 +; SI-NEXT: v_mov_b32_e32 v6, s18 +; SI-NEXT: v_mov_b32_e32 v7, s19 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v9, s21 +; SI-NEXT: v_mov_b32_e32 v10, s22 +; SI-NEXT: v_mov_b32_e32 v11, s23 +; SI-NEXT: v_mov_b32_e32 v12, s24 +; SI-NEXT: v_mov_b32_e32 v13, s25 +; SI-NEXT: v_mov_b32_e32 v14, s26 +; SI-NEXT: v_mov_b32_e32 v15, s27 +; SI-NEXT: v_mov_b32_e32 v16, s28 +; SI-NEXT: v_mov_b32_e32 v17, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; SI-NEXT: v_writelane_b32 v4, s84, 28 +; SI-NEXT: v_writelane_b32 v18, s84, 28 +; SI-NEXT: v_readfirstlane_b32 s18, v4 +; SI-NEXT: v_readfirstlane_b32 s19, v5 +; SI-NEXT: v_readfirstlane_b32 s16, v6 +; SI-NEXT: v_readfirstlane_b32 s17, v7 +; SI-NEXT: v_readfirstlane_b32 s14, v8 +; SI-NEXT: v_readfirstlane_b32 s15, v9 +; SI-NEXT: v_readfirstlane_b32 s12, v10 +; SI-NEXT: v_readfirstlane_b32 s13, v11 +; SI-NEXT: v_readfirstlane_b32 s10, v12 +; SI-NEXT: v_readfirstlane_b32 s11, v13 +; SI-NEXT: v_readfirstlane_b32 s8, v14 +; SI-NEXT: v_readfirstlane_b32 s9, v15 +; SI-NEXT: v_readfirstlane_b32 s6, v16 +; SI-NEXT: v_readfirstlane_b32 s7, v17 ; SI-NEXT: v_readfirstlane_b32 s4, v1 -; SI-NEXT: s_and_b64 s[6:7], vcc, exec +; SI-NEXT: s_and_b64 s[20:21], vcc, exec ; SI-NEXT: v_readfirstlane_b32 s5, v2 -; SI-NEXT: v_writelane_b32 v4, s85, 29 +; SI-NEXT: v_writelane_b32 v18, s85, 29 ; SI-NEXT: s_cbranch_scc0 .LBB69_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s38, s5, 24 ; SI-NEXT: s_lshr_b32 s39, s5, 16 ; SI-NEXT: s_lshr_b32 s48, s5, 8 -; SI-NEXT: s_lshr_b32 s49, s29, 24 -; SI-NEXT: s_lshr_b32 s50, s29, 16 -; SI-NEXT: s_lshr_b32 s51, s29, 8 -; SI-NEXT: s_lshr_b32 s52, s27, 24 -; SI-NEXT: s_lshr_b32 s53, s27, 16 -; SI-NEXT: s_lshr_b32 s54, s27, 8 -; SI-NEXT: s_lshr_b32 s55, s25, 24 -; SI-NEXT: s_lshr_b32 s64, s25, 16 -; SI-NEXT: s_lshr_b32 s65, s25, 8 -; SI-NEXT: s_lshr_b32 s66, s23, 24 -; SI-NEXT: s_lshr_b32 s67, s23, 16 -; SI-NEXT: s_lshr_b32 s68, s23, 8 -; SI-NEXT: s_lshr_b32 s69, s21, 24 -; SI-NEXT: s_lshr_b32 s70, s21, 16 -; SI-NEXT: s_lshr_b32 s71, s21, 8 -; SI-NEXT: s_lshr_b32 s80, s19, 24 -; SI-NEXT: s_lshr_b32 s81, s19, 16 -; SI-NEXT: s_lshr_b32 s82, s19, 8 -; SI-NEXT: s_lshr_b32 s83, s17, 24 -; SI-NEXT: s_lshr_b32 s84, s17, 16 -; SI-NEXT: s_lshr_b32 s85, s17, 8 -; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 8 -; SI-NEXT: s_lshr_b64 s[12:13], s[28:29], 24 -; SI-NEXT: s_lshr_b64 s[14:15], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[40:41], s[28:29], 8 -; SI-NEXT: s_lshr_b64 s[42:43], s[26:27], 24 -; SI-NEXT: s_lshr_b64 s[44:45], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 8 -; SI-NEXT: s_lshr_b64 s[56:57], s[24:25], 24 -; SI-NEXT: s_lshr_b64 s[58:59], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[60:61], s[24:25], 8 -; SI-NEXT: s_lshr_b64 s[74:75], s[22:23], 24 -; SI-NEXT: s_lshr_b64 s[78:79], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[88:89], s[22:23], 8 -; SI-NEXT: s_lshr_b64 s[62:63], s[20:21], 24 -; SI-NEXT: s_lshr_b64 s[72:73], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[20:21], 8 -; SI-NEXT: s_lshr_b64 s[90:91], s[18:19], 24 -; SI-NEXT: s_lshr_b64 s[92:93], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[94:95], s[18:19], 8 -; SI-NEXT: s_lshr_b64 s[30:31], s[16:17], 24 -; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 16 -; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 8 +; SI-NEXT: s_lshr_b32 s49, s7, 24 +; SI-NEXT: s_lshr_b32 s50, s7, 16 +; SI-NEXT: s_lshr_b32 s51, s7, 8 +; SI-NEXT: s_lshr_b32 s52, s9, 24 +; SI-NEXT: s_lshr_b32 s53, s9, 16 +; SI-NEXT: s_lshr_b32 s54, s9, 8 +; SI-NEXT: s_lshr_b32 s55, s11, 24 +; SI-NEXT: s_lshr_b32 s64, s11, 16 +; SI-NEXT: s_lshr_b32 s65, s11, 8 +; SI-NEXT: s_lshr_b32 s66, s13, 24 +; SI-NEXT: s_lshr_b32 s67, s13, 16 +; SI-NEXT: s_lshr_b32 s68, s13, 8 +; SI-NEXT: s_lshr_b32 s69, s15, 24 +; SI-NEXT: s_lshr_b32 s70, s15, 16 +; SI-NEXT: s_lshr_b32 s71, s15, 8 +; SI-NEXT: s_lshr_b32 s80, s17, 24 +; SI-NEXT: s_lshr_b32 s81, s17, 16 +; SI-NEXT: s_lshr_b32 s82, s17, 8 +; SI-NEXT: s_lshr_b32 s83, s19, 24 +; SI-NEXT: s_lshr_b32 s84, s19, 16 +; SI-NEXT: s_lshr_b32 s85, s19, 8 +; SI-NEXT: s_lshr_b64 s[20:21], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[22:23], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[42:43], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[44:45], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[56:57], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 8 +; SI-NEXT: s_lshr_b64 s[74:75], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[78:79], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[62:63], s[14:15], 24 +; SI-NEXT: s_lshr_b64 s[72:73], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[14:15], 8 +; SI-NEXT: s_lshr_b64 s[90:91], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[92:93], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[16:17], 8 +; SI-NEXT: s_lshr_b64 s[30:31], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[34:35], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[36:37], s[18:19], 8 ; SI-NEXT: s_cbranch_execnz .LBB69_3 ; SI-NEXT: .LBB69_2: ; %cmp.true ; SI-NEXT: s_add_u32 s4, s4, 3 ; SI-NEXT: s_addc_u32 s5, s5, 0 -; SI-NEXT: s_add_u32 s28, s28, 3 -; SI-NEXT: s_addc_u32 s29, s29, 0 -; SI-NEXT: s_add_u32 s26, s26, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 -; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s25, s25, 0 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s7, s7, 0 +; SI-NEXT: s_add_u32 s8, s8, 3 +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 ; SI-NEXT: s_add_u32 s16, s16, 3 ; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 ; SI-NEXT: s_lshr_b32 s38, s5, 24 ; SI-NEXT: s_lshr_b32 s39, s5, 16 ; SI-NEXT: s_lshr_b32 s48, s5, 8 -; SI-NEXT: s_lshr_b32 s49, s29, 24 -; SI-NEXT: s_lshr_b32 s50, s29, 16 -; SI-NEXT: s_lshr_b32 s51, s29, 8 -; SI-NEXT: s_lshr_b32 s52, s27, 24 -; SI-NEXT: s_lshr_b32 s53, s27, 16 -; SI-NEXT: s_lshr_b32 s54, s27, 8 -; SI-NEXT: s_lshr_b32 s55, s25, 24 -; SI-NEXT: s_lshr_b32 s64, s25, 16 -; SI-NEXT: s_lshr_b32 s65, s25, 8 -; SI-NEXT: s_lshr_b32 s66, s23, 24 -; SI-NEXT: s_lshr_b32 s67, s23, 16 -; SI-NEXT: s_lshr_b32 s68, s23, 8 -; SI-NEXT: s_lshr_b32 s69, s21, 24 -; SI-NEXT: s_lshr_b32 s70, s21, 16 -; SI-NEXT: s_lshr_b32 s71, s21, 8 -; SI-NEXT: s_lshr_b32 s80, s19, 24 -; SI-NEXT: s_lshr_b32 s81, s19, 16 -; SI-NEXT: s_lshr_b32 s82, s19, 8 -; SI-NEXT: s_lshr_b32 s83, s17, 24 -; SI-NEXT: s_lshr_b32 s84, s17, 16 -; SI-NEXT: s_lshr_b32 s85, s17, 8 -; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 8 -; SI-NEXT: s_lshr_b64 s[12:13], s[28:29], 24 -; SI-NEXT: s_lshr_b64 s[14:15], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[40:41], s[28:29], 8 -; SI-NEXT: s_lshr_b64 s[42:43], s[26:27], 24 -; SI-NEXT: s_lshr_b64 s[44:45], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 8 -; SI-NEXT: s_lshr_b64 s[56:57], s[24:25], 24 -; SI-NEXT: s_lshr_b64 s[58:59], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[60:61], s[24:25], 8 -; SI-NEXT: s_lshr_b64 s[74:75], s[22:23], 24 -; SI-NEXT: s_lshr_b64 s[78:79], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[88:89], s[22:23], 8 -; SI-NEXT: s_lshr_b64 s[62:63], s[20:21], 24 -; SI-NEXT: s_lshr_b64 s[72:73], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[20:21], 8 -; SI-NEXT: s_lshr_b64 s[90:91], s[18:19], 24 -; SI-NEXT: s_lshr_b64 s[92:93], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[94:95], s[18:19], 8 -; SI-NEXT: s_lshr_b64 s[30:31], s[16:17], 24 -; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 16 -; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 8 +; SI-NEXT: s_lshr_b32 s49, s7, 24 +; SI-NEXT: s_lshr_b32 s50, s7, 16 +; SI-NEXT: s_lshr_b32 s51, s7, 8 +; SI-NEXT: s_lshr_b32 s52, s9, 24 +; SI-NEXT: s_lshr_b32 s53, s9, 16 +; SI-NEXT: s_lshr_b32 s54, s9, 8 +; SI-NEXT: s_lshr_b32 s55, s11, 24 +; SI-NEXT: s_lshr_b32 s64, s11, 16 +; SI-NEXT: s_lshr_b32 s65, s11, 8 +; SI-NEXT: s_lshr_b32 s66, s13, 24 +; SI-NEXT: s_lshr_b32 s67, s13, 16 +; SI-NEXT: s_lshr_b32 s68, s13, 8 +; SI-NEXT: s_lshr_b32 s69, s15, 24 +; SI-NEXT: s_lshr_b32 s70, s15, 16 +; SI-NEXT: s_lshr_b32 s71, s15, 8 +; SI-NEXT: s_lshr_b32 s80, s17, 24 +; SI-NEXT: s_lshr_b32 s81, s17, 16 +; SI-NEXT: s_lshr_b32 s82, s17, 8 +; SI-NEXT: s_lshr_b32 s83, s19, 24 +; SI-NEXT: s_lshr_b32 s84, s19, 16 +; SI-NEXT: s_lshr_b32 s85, s19, 8 +; SI-NEXT: s_lshr_b64 s[20:21], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[22:23], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[42:43], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[44:45], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[56:57], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 8 +; SI-NEXT: s_lshr_b64 s[74:75], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[78:79], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[62:63], s[14:15], 24 +; SI-NEXT: s_lshr_b64 s[72:73], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[14:15], 8 +; SI-NEXT: s_lshr_b64 s[90:91], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[92:93], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[16:17], 8 +; SI-NEXT: s_lshr_b64 s[30:31], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[34:35], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[36:37], s[18:19], 8 ; SI-NEXT: .LBB69_3: ; %end -; SI-NEXT: s_lshl_b32 s7, s36, 8 -; SI-NEXT: s_and_b32 s9, s16, 0xff -; SI-NEXT: s_or_b32 s7, s9, s7 -; SI-NEXT: s_and_b32 s9, s34, 0xff -; SI-NEXT: s_lshl_b32 s11, s30, 24 -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: s_and_b32 s7, s17, 0xff -; SI-NEXT: s_lshl_b32 s9, s85, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s84, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s83, 24 -; SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_lshl_b32 s7, s94, 8 -; SI-NEXT: s_and_b32 s9, s18, 0xff -; SI-NEXT: s_or_b32 s7, s9, s7 -; SI-NEXT: s_and_b32 s9, s92, 0xff -; SI-NEXT: s_lshl_b32 s11, s90, 24 -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: s_and_b32 s7, s19, 0xff -; SI-NEXT: s_lshl_b32 s9, s82, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s81, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s80, 24 -; SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s21, s36, 8 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_or_b32 s18, s18, s21 +; SI-NEXT: s_and_b32 s21, s34, 0xff +; SI-NEXT: s_lshl_b32 s23, s30, 24 +; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_or_b32 s21, s23, s21 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_or_b32 s18, s18, s21 +; SI-NEXT: v_mov_b32_e32 v1, s18 +; SI-NEXT: s_and_b32 s18, s19, 0xff +; SI-NEXT: s_lshl_b32 s19, s85, 8 +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: s_and_b32 s19, s84, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s21, s83, 24 +; SI-NEXT: s_or_b32 s19, s21, s19 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_lshl_b32 s18, s94, 8 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_or_b32 s16, s16, s18 +; SI-NEXT: s_and_b32 s18, s92, 0xff +; SI-NEXT: s_lshl_b32 s19, s90, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s18 +; SI-NEXT: v_mov_b32_e32 v3, s16 +; SI-NEXT: s_and_b32 s16, s17, 0xff +; SI-NEXT: s_lshl_b32 s17, s82, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s81, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s80, 24 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s20, 0xff -; SI-NEXT: s_lshl_b32 s9, s76, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s72, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s62, 24 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s14, s14, 0xff +; SI-NEXT: s_lshl_b32 s16, s76, 8 +; SI-NEXT: s_or_b32 s14, s14, s16 +; SI-NEXT: s_and_b32 s16, s72, 0xff +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_lshl_b32 s17, s62, 24 ; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s14, s14, s16 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s21, 0xff -; SI-NEXT: s_lshl_b32 s9, s71, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s70, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s69, 24 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_and_b32 s14, s15, 0xff +; SI-NEXT: s_lshl_b32 s15, s71, 8 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_and_b32 s15, s70, 0xff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_lshl_b32 s16, s69, 24 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_or_b32 s15, s16, s15 ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xff -; SI-NEXT: s_lshl_b32 s9, s88, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s78, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s74, 24 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_and_b32 s12, s12, 0xff +; SI-NEXT: s_lshl_b32 s14, s88, 8 +; SI-NEXT: s_or_b32 s12, s12, s14 +; SI-NEXT: s_and_b32 s14, s78, 0xff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_lshl_b32 s15, s74, 24 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_or_b32 s14, s15, s14 ; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s12, s12, s14 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s23, 0xff -; SI-NEXT: s_lshl_b32 s9, s68, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s67, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s66, 24 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s12, s13, 0xff +; SI-NEXT: s_lshl_b32 s13, s68, 8 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s67, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_lshl_b32 s14, s66, 24 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_or_b32 s13, s14, s13 ; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s12, s12, s13 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s24, 0xff -; SI-NEXT: s_lshl_b32 s9, s60, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s58, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s56, 24 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s10, s10, 0xff +; SI-NEXT: s_lshl_b32 s12, s60, 8 +; SI-NEXT: s_or_b32 s10, s10, s12 +; SI-NEXT: s_and_b32 s12, s58, 0xff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_lshl_b32 s13, s56, 24 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s12, s13, s12 ; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s10, s10, s12 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s25, 0xff -; SI-NEXT: s_lshl_b32 s9, s65, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s64, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s55, 24 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s10, s11, 0xff +; SI-NEXT: s_lshl_b32 s11, s65, 8 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s64, 0xff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s12, s55, 24 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s11, s12, s11 ; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s26, 0xff -; SI-NEXT: s_lshl_b32 s9, s46, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s44, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: s_lshl_b32 s10, s46, 8 +; SI-NEXT: s_or_b32 s8, s8, s10 +; SI-NEXT: s_and_b32 s10, s44, 0xff +; SI-NEXT: s_lshl_b32 s10, s10, 16 ; SI-NEXT: s_lshl_b32 s11, s42, 24 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s8, s8, s10 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s27, 0xff +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s8, s9, 0xff ; SI-NEXT: s_lshl_b32 s9, s54, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s53, 0xff ; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s52, 24 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: s_lshl_b32 s10, s52, 24 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s28, 0xff -; SI-NEXT: s_lshl_b32 s9, s40, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s14, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s12, 24 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_lshl_b32 s8, s40, 8 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: s_and_b32 s8, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s9, s26, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s6, s6, s8 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s29, 0xff -; SI-NEXT: s_lshl_b32 s9, s51, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s50, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s49, 24 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s6, s7, 0xff +; SI-NEXT: s_lshl_b32 s7, s51, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s50, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s49, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s7, s10, 8 -; SI-NEXT: s_or_b32 s4, s4, s7 -; SI-NEXT: s_and_b32 s7, s8, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: s_lshl_b32 s6, s6, 24 +; SI-NEXT: s_lshl_b32 s6, s24, 8 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s20, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -39491,38 +39803,38 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: v_readlane_b32 s85, v4, 29 -; SI-NEXT: v_readlane_b32 s84, v4, 28 -; SI-NEXT: v_readlane_b32 s83, v4, 27 -; SI-NEXT: v_readlane_b32 s82, v4, 26 -; SI-NEXT: v_readlane_b32 s81, v4, 25 -; SI-NEXT: v_readlane_b32 s80, v4, 24 -; SI-NEXT: v_readlane_b32 s71, v4, 23 -; SI-NEXT: v_readlane_b32 s70, v4, 22 -; SI-NEXT: v_readlane_b32 s69, v4, 21 -; SI-NEXT: v_readlane_b32 s68, v4, 20 -; SI-NEXT: v_readlane_b32 s67, v4, 19 -; SI-NEXT: v_readlane_b32 s66, v4, 18 -; SI-NEXT: v_readlane_b32 s65, v4, 17 -; SI-NEXT: v_readlane_b32 s64, v4, 16 -; SI-NEXT: v_readlane_b32 s55, v4, 15 -; SI-NEXT: v_readlane_b32 s54, v4, 14 -; SI-NEXT: v_readlane_b32 s53, v4, 13 -; SI-NEXT: v_readlane_b32 s52, v4, 12 -; SI-NEXT: v_readlane_b32 s51, v4, 11 -; SI-NEXT: v_readlane_b32 s50, v4, 10 -; SI-NEXT: v_readlane_b32 s49, v4, 9 -; SI-NEXT: v_readlane_b32 s48, v4, 8 -; SI-NEXT: v_readlane_b32 s39, v4, 7 -; SI-NEXT: v_readlane_b32 s38, v4, 6 -; SI-NEXT: v_readlane_b32 s37, v4, 5 -; SI-NEXT: v_readlane_b32 s36, v4, 4 -; SI-NEXT: v_readlane_b32 s35, v4, 3 -; SI-NEXT: v_readlane_b32 s34, v4, 2 -; SI-NEXT: v_readlane_b32 s31, v4, 1 -; SI-NEXT: v_readlane_b32 s30, v4, 0 +; SI-NEXT: v_readlane_b32 s85, v18, 29 +; SI-NEXT: v_readlane_b32 s84, v18, 28 +; SI-NEXT: v_readlane_b32 s83, v18, 27 +; SI-NEXT: v_readlane_b32 s82, v18, 26 +; SI-NEXT: v_readlane_b32 s81, v18, 25 +; SI-NEXT: v_readlane_b32 s80, v18, 24 +; SI-NEXT: v_readlane_b32 s71, v18, 23 +; SI-NEXT: v_readlane_b32 s70, v18, 22 +; SI-NEXT: v_readlane_b32 s69, v18, 21 +; SI-NEXT: v_readlane_b32 s68, v18, 20 +; SI-NEXT: v_readlane_b32 s67, v18, 19 +; SI-NEXT: v_readlane_b32 s66, v18, 18 +; SI-NEXT: v_readlane_b32 s65, v18, 17 +; SI-NEXT: v_readlane_b32 s64, v18, 16 +; SI-NEXT: v_readlane_b32 s55, v18, 15 +; SI-NEXT: v_readlane_b32 s54, v18, 14 +; SI-NEXT: v_readlane_b32 s53, v18, 13 +; SI-NEXT: v_readlane_b32 s52, v18, 12 +; SI-NEXT: v_readlane_b32 s51, v18, 11 +; SI-NEXT: v_readlane_b32 s50, v18, 10 +; SI-NEXT: v_readlane_b32 s49, v18, 9 +; SI-NEXT: v_readlane_b32 s48, v18, 8 +; SI-NEXT: v_readlane_b32 s39, v18, 7 +; SI-NEXT: v_readlane_b32 s38, v18, 6 +; SI-NEXT: v_readlane_b32 s37, v18, 5 +; SI-NEXT: v_readlane_b32 s36, v18, 4 +; SI-NEXT: v_readlane_b32 s35, v18, 3 +; SI-NEXT: v_readlane_b32 s34, v18, 2 +; SI-NEXT: v_readlane_b32 s31, v18, 1 +; SI-NEXT: v_readlane_b32 s30, v18, 0 ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -39570,43 +39882,71 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in ; SI-NEXT: ; implicit-def: $sgpr44 ; SI-NEXT: ; implicit-def: $sgpr42 ; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr20 ; SI-NEXT: s_branch .LBB69_2 ; ; VI-LABEL: bitcast_v8i64_to_v64i8_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v4, s30, 0 -; VI-NEXT: v_writelane_b32 v4, s31, 1 -; VI-NEXT: v_writelane_b32 v4, s34, 2 -; VI-NEXT: v_writelane_b32 v4, s35, 3 -; VI-NEXT: v_writelane_b32 v4, s36, 4 -; VI-NEXT: v_writelane_b32 v4, s37, 5 -; VI-NEXT: v_writelane_b32 v4, s38, 6 -; VI-NEXT: v_writelane_b32 v4, s39, 7 -; VI-NEXT: v_writelane_b32 v4, s48, 8 -; VI-NEXT: v_writelane_b32 v4, s49, 9 -; VI-NEXT: v_writelane_b32 v4, s50, 10 -; VI-NEXT: v_writelane_b32 v4, s51, 11 -; VI-NEXT: v_writelane_b32 v4, s52, 12 -; VI-NEXT: v_writelane_b32 v4, s53, 13 -; VI-NEXT: v_writelane_b32 v4, s54, 14 -; VI-NEXT: v_writelane_b32 v4, s55, 15 -; VI-NEXT: v_writelane_b32 v4, s64, 16 -; VI-NEXT: v_writelane_b32 v4, s65, 17 +; VI-NEXT: v_writelane_b32 v18, s30, 0 +; VI-NEXT: v_writelane_b32 v18, s31, 1 +; VI-NEXT: v_writelane_b32 v18, s34, 2 +; VI-NEXT: v_writelane_b32 v18, s35, 3 +; VI-NEXT: v_writelane_b32 v18, s36, 4 +; VI-NEXT: v_writelane_b32 v18, s37, 5 +; VI-NEXT: v_writelane_b32 v18, s38, 6 +; VI-NEXT: v_writelane_b32 v18, s39, 7 +; VI-NEXT: v_writelane_b32 v18, s48, 8 +; VI-NEXT: v_writelane_b32 v18, s49, 9 +; VI-NEXT: v_writelane_b32 v18, s50, 10 +; VI-NEXT: v_writelane_b32 v18, s51, 11 +; VI-NEXT: v_writelane_b32 v18, s52, 12 +; VI-NEXT: v_writelane_b32 v18, s53, 13 +; VI-NEXT: v_writelane_b32 v18, s54, 14 +; VI-NEXT: v_writelane_b32 v18, s55, 15 +; VI-NEXT: v_writelane_b32 v18, s64, 16 +; VI-NEXT: v_writelane_b32 v18, s65, 17 +; VI-NEXT: v_mov_b32_e32 v4, s16 +; VI-NEXT: v_mov_b32_e32 v5, s17 +; VI-NEXT: v_mov_b32_e32 v6, s18 +; VI-NEXT: v_mov_b32_e32 v7, s19 +; VI-NEXT: v_mov_b32_e32 v8, s20 +; VI-NEXT: v_mov_b32_e32 v9, s21 +; VI-NEXT: v_mov_b32_e32 v10, s22 +; VI-NEXT: v_mov_b32_e32 v11, s23 +; VI-NEXT: v_mov_b32_e32 v12, s24 +; VI-NEXT: v_mov_b32_e32 v13, s25 +; VI-NEXT: v_mov_b32_e32 v14, s26 +; VI-NEXT: v_mov_b32_e32 v15, s27 +; VI-NEXT: v_mov_b32_e32 v16, s28 +; VI-NEXT: v_mov_b32_e32 v17, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; VI-NEXT: v_writelane_b32 v4, s66, 18 +; VI-NEXT: v_writelane_b32 v18, s66, 18 +; VI-NEXT: v_readfirstlane_b32 s18, v4 +; VI-NEXT: v_readfirstlane_b32 s19, v5 +; VI-NEXT: v_readfirstlane_b32 s16, v6 +; VI-NEXT: v_readfirstlane_b32 s17, v7 +; VI-NEXT: v_readfirstlane_b32 s14, v8 +; VI-NEXT: v_readfirstlane_b32 s15, v9 +; VI-NEXT: v_readfirstlane_b32 s12, v10 +; VI-NEXT: v_readfirstlane_b32 s13, v11 +; VI-NEXT: v_readfirstlane_b32 s10, v12 +; VI-NEXT: v_readfirstlane_b32 s11, v13 +; VI-NEXT: v_readfirstlane_b32 s8, v14 +; VI-NEXT: v_readfirstlane_b32 s9, v15 +; VI-NEXT: v_readfirstlane_b32 s6, v16 +; VI-NEXT: v_readfirstlane_b32 s7, v17 ; VI-NEXT: v_readfirstlane_b32 s4, v1 -; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_and_b64 s[20:21], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s5, v2 -; VI-NEXT: v_writelane_b32 v4, s67, 19 +; VI-NEXT: v_writelane_b32 v18, s67, 19 ; VI-NEXT: s_cbranch_scc0 .LBB69_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_lshr_b32 s56, s5, 24 @@ -39614,287 +39954,287 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in ; VI-NEXT: s_lshr_b32 s58, s5, 8 ; VI-NEXT: s_lshr_b32 s59, s4, 16 ; VI-NEXT: s_lshr_b32 s60, s4, 8 -; VI-NEXT: s_lshr_b32 s61, s29, 24 -; VI-NEXT: s_lshr_b32 s62, s29, 16 -; VI-NEXT: s_lshr_b32 s63, s29, 8 -; VI-NEXT: s_lshr_b32 s72, s28, 16 -; VI-NEXT: s_lshr_b32 s73, s28, 8 -; VI-NEXT: s_lshr_b32 s74, s27, 24 -; VI-NEXT: s_lshr_b32 s75, s27, 16 -; VI-NEXT: s_lshr_b32 s76, s27, 8 -; VI-NEXT: s_lshr_b32 s77, s26, 16 -; VI-NEXT: s_lshr_b32 s78, s26, 8 -; VI-NEXT: s_lshr_b32 s79, s25, 24 -; VI-NEXT: s_lshr_b32 s88, s25, 16 -; VI-NEXT: s_lshr_b32 s89, s25, 8 -; VI-NEXT: s_lshr_b32 s90, s24, 16 -; VI-NEXT: s_lshr_b32 s91, s24, 8 -; VI-NEXT: s_lshr_b32 s30, s23, 24 -; VI-NEXT: s_lshr_b32 s31, s23, 16 -; VI-NEXT: s_lshr_b32 s34, s23, 8 -; VI-NEXT: s_lshr_b32 s35, s22, 16 -; VI-NEXT: s_lshr_b32 s36, s22, 8 -; VI-NEXT: s_lshr_b32 s37, s21, 24 -; VI-NEXT: s_lshr_b32 s38, s21, 16 -; VI-NEXT: s_lshr_b32 s39, s21, 8 -; VI-NEXT: s_lshr_b32 s48, s20, 16 -; VI-NEXT: s_lshr_b32 s49, s20, 8 -; VI-NEXT: s_lshr_b32 s50, s19, 24 -; VI-NEXT: s_lshr_b32 s51, s19, 16 -; VI-NEXT: s_lshr_b32 s52, s19, 8 -; VI-NEXT: s_lshr_b32 s53, s18, 16 -; VI-NEXT: s_lshr_b32 s54, s18, 8 -; VI-NEXT: s_lshr_b32 s55, s17, 24 -; VI-NEXT: s_lshr_b32 s64, s17, 16 -; VI-NEXT: s_lshr_b32 s65, s17, 8 -; VI-NEXT: s_lshr_b32 s66, s16, 16 -; VI-NEXT: s_lshr_b32 s67, s16, 8 -; VI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 -; VI-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 -; VI-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 -; VI-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s61, s7, 24 +; VI-NEXT: s_lshr_b32 s62, s7, 16 +; VI-NEXT: s_lshr_b32 s63, s7, 8 +; VI-NEXT: s_lshr_b32 s72, s6, 16 +; VI-NEXT: s_lshr_b32 s73, s6, 8 +; VI-NEXT: s_lshr_b32 s74, s9, 24 +; VI-NEXT: s_lshr_b32 s75, s9, 16 +; VI-NEXT: s_lshr_b32 s76, s9, 8 +; VI-NEXT: s_lshr_b32 s77, s8, 16 +; VI-NEXT: s_lshr_b32 s78, s8, 8 +; VI-NEXT: s_lshr_b32 s79, s11, 24 +; VI-NEXT: s_lshr_b32 s88, s11, 16 +; VI-NEXT: s_lshr_b32 s89, s11, 8 +; VI-NEXT: s_lshr_b32 s90, s10, 16 +; VI-NEXT: s_lshr_b32 s91, s10, 8 +; VI-NEXT: s_lshr_b32 s30, s13, 24 +; VI-NEXT: s_lshr_b32 s31, s13, 16 +; VI-NEXT: s_lshr_b32 s34, s13, 8 +; VI-NEXT: s_lshr_b32 s35, s12, 16 +; VI-NEXT: s_lshr_b32 s36, s12, 8 +; VI-NEXT: s_lshr_b32 s37, s15, 24 +; VI-NEXT: s_lshr_b32 s38, s15, 16 +; VI-NEXT: s_lshr_b32 s39, s15, 8 +; VI-NEXT: s_lshr_b32 s48, s14, 16 +; VI-NEXT: s_lshr_b32 s49, s14, 8 +; VI-NEXT: s_lshr_b32 s50, s17, 24 +; VI-NEXT: s_lshr_b32 s51, s17, 16 +; VI-NEXT: s_lshr_b32 s52, s17, 8 +; VI-NEXT: s_lshr_b32 s53, s16, 16 +; VI-NEXT: s_lshr_b32 s54, s16, 8 +; VI-NEXT: s_lshr_b32 s55, s19, 24 +; VI-NEXT: s_lshr_b32 s64, s19, 16 +; VI-NEXT: s_lshr_b32 s65, s19, 8 +; VI-NEXT: s_lshr_b32 s66, s18, 16 +; VI-NEXT: s_lshr_b32 s67, s18, 8 +; VI-NEXT: s_lshr_b64 s[20:21], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[22:23], s[6:7], 24 +; VI-NEXT: s_lshr_b64 s[24:25], s[8:9], 24 +; VI-NEXT: s_lshr_b64 s[26:27], s[10:11], 24 +; VI-NEXT: s_lshr_b64 s[28:29], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[40:41], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[42:43], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[44:45], s[18:19], 24 ; VI-NEXT: s_cbranch_execnz .LBB69_3 ; VI-NEXT: .LBB69_2: ; %cmp.true -; VI-NEXT: s_add_u32 s16, s16, 3 -; VI-NEXT: s_addc_u32 s17, s17, 0 ; VI-NEXT: s_add_u32 s18, s18, 3 ; VI-NEXT: s_addc_u32 s19, s19, 0 -; VI-NEXT: s_add_u32 s20, s20, 3 -; VI-NEXT: s_addc_u32 s21, s21, 0 -; VI-NEXT: s_add_u32 s22, s22, 3 -; VI-NEXT: s_addc_u32 s23, s23, 0 -; VI-NEXT: s_add_u32 s24, s24, 3 -; VI-NEXT: s_addc_u32 s25, s25, 0 -; VI-NEXT: s_add_u32 s26, s26, 3 -; VI-NEXT: s_addc_u32 s27, s27, 0 -; VI-NEXT: s_add_u32 s28, s28, 3 -; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_add_u32 s14, s14, 3 +; VI-NEXT: s_addc_u32 s15, s15, 0 +; VI-NEXT: s_add_u32 s12, s12, 3 +; VI-NEXT: s_addc_u32 s13, s13, 0 +; VI-NEXT: s_add_u32 s10, s10, 3 +; VI-NEXT: s_addc_u32 s11, s11, 0 +; VI-NEXT: s_add_u32 s8, s8, 3 +; VI-NEXT: s_addc_u32 s9, s9, 0 +; VI-NEXT: s_add_u32 s6, s6, 3 +; VI-NEXT: s_addc_u32 s7, s7, 0 ; VI-NEXT: s_add_u32 s4, s4, 3 ; VI-NEXT: s_addc_u32 s5, s5, 0 -; VI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 -; VI-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 -; VI-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 -; VI-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[20:21], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[22:23], s[6:7], 24 +; VI-NEXT: s_lshr_b64 s[24:25], s[8:9], 24 +; VI-NEXT: s_lshr_b64 s[26:27], s[10:11], 24 +; VI-NEXT: s_lshr_b64 s[28:29], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[40:41], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[42:43], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[44:45], s[18:19], 24 ; VI-NEXT: s_lshr_b32 s56, s5, 24 ; VI-NEXT: s_lshr_b32 s57, s5, 16 ; VI-NEXT: s_lshr_b32 s58, s5, 8 ; VI-NEXT: s_lshr_b32 s59, s4, 16 ; VI-NEXT: s_lshr_b32 s60, s4, 8 -; VI-NEXT: s_lshr_b32 s61, s29, 24 -; VI-NEXT: s_lshr_b32 s62, s29, 16 -; VI-NEXT: s_lshr_b32 s63, s29, 8 -; VI-NEXT: s_lshr_b32 s72, s28, 16 -; VI-NEXT: s_lshr_b32 s73, s28, 8 -; VI-NEXT: s_lshr_b32 s74, s27, 24 -; VI-NEXT: s_lshr_b32 s75, s27, 16 -; VI-NEXT: s_lshr_b32 s76, s27, 8 -; VI-NEXT: s_lshr_b32 s77, s26, 16 -; VI-NEXT: s_lshr_b32 s78, s26, 8 -; VI-NEXT: s_lshr_b32 s79, s25, 24 -; VI-NEXT: s_lshr_b32 s88, s25, 16 -; VI-NEXT: s_lshr_b32 s89, s25, 8 -; VI-NEXT: s_lshr_b32 s90, s24, 16 -; VI-NEXT: s_lshr_b32 s91, s24, 8 -; VI-NEXT: s_lshr_b32 s30, s23, 24 -; VI-NEXT: s_lshr_b32 s31, s23, 16 -; VI-NEXT: s_lshr_b32 s34, s23, 8 -; VI-NEXT: s_lshr_b32 s35, s22, 16 -; VI-NEXT: s_lshr_b32 s36, s22, 8 -; VI-NEXT: s_lshr_b32 s37, s21, 24 -; VI-NEXT: s_lshr_b32 s38, s21, 16 -; VI-NEXT: s_lshr_b32 s39, s21, 8 -; VI-NEXT: s_lshr_b32 s48, s20, 16 -; VI-NEXT: s_lshr_b32 s49, s20, 8 -; VI-NEXT: s_lshr_b32 s50, s19, 24 -; VI-NEXT: s_lshr_b32 s51, s19, 16 -; VI-NEXT: s_lshr_b32 s52, s19, 8 -; VI-NEXT: s_lshr_b32 s53, s18, 16 -; VI-NEXT: s_lshr_b32 s54, s18, 8 -; VI-NEXT: s_lshr_b32 s55, s17, 24 -; VI-NEXT: s_lshr_b32 s64, s17, 16 -; VI-NEXT: s_lshr_b32 s65, s17, 8 -; VI-NEXT: s_lshr_b32 s66, s16, 16 -; VI-NEXT: s_lshr_b32 s67, s16, 8 +; VI-NEXT: s_lshr_b32 s61, s7, 24 +; VI-NEXT: s_lshr_b32 s62, s7, 16 +; VI-NEXT: s_lshr_b32 s63, s7, 8 +; VI-NEXT: s_lshr_b32 s72, s6, 16 +; VI-NEXT: s_lshr_b32 s73, s6, 8 +; VI-NEXT: s_lshr_b32 s74, s9, 24 +; VI-NEXT: s_lshr_b32 s75, s9, 16 +; VI-NEXT: s_lshr_b32 s76, s9, 8 +; VI-NEXT: s_lshr_b32 s77, s8, 16 +; VI-NEXT: s_lshr_b32 s78, s8, 8 +; VI-NEXT: s_lshr_b32 s79, s11, 24 +; VI-NEXT: s_lshr_b32 s88, s11, 16 +; VI-NEXT: s_lshr_b32 s89, s11, 8 +; VI-NEXT: s_lshr_b32 s90, s10, 16 +; VI-NEXT: s_lshr_b32 s91, s10, 8 +; VI-NEXT: s_lshr_b32 s30, s13, 24 +; VI-NEXT: s_lshr_b32 s31, s13, 16 +; VI-NEXT: s_lshr_b32 s34, s13, 8 +; VI-NEXT: s_lshr_b32 s35, s12, 16 +; VI-NEXT: s_lshr_b32 s36, s12, 8 +; VI-NEXT: s_lshr_b32 s37, s15, 24 +; VI-NEXT: s_lshr_b32 s38, s15, 16 +; VI-NEXT: s_lshr_b32 s39, s15, 8 +; VI-NEXT: s_lshr_b32 s48, s14, 16 +; VI-NEXT: s_lshr_b32 s49, s14, 8 +; VI-NEXT: s_lshr_b32 s50, s17, 24 +; VI-NEXT: s_lshr_b32 s51, s17, 16 +; VI-NEXT: s_lshr_b32 s52, s17, 8 +; VI-NEXT: s_lshr_b32 s53, s16, 16 +; VI-NEXT: s_lshr_b32 s54, s16, 8 +; VI-NEXT: s_lshr_b32 s55, s19, 24 +; VI-NEXT: s_lshr_b32 s64, s19, 16 +; VI-NEXT: s_lshr_b32 s65, s19, 8 +; VI-NEXT: s_lshr_b32 s66, s18, 16 +; VI-NEXT: s_lshr_b32 s67, s18, 8 ; VI-NEXT: .LBB69_3: ; %end -; VI-NEXT: s_and_b32 s7, s16, 0xff -; VI-NEXT: s_lshl_b32 s9, s67, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s66, 0xff -; VI-NEXT: s_lshl_b32 s11, s44, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: s_and_b32 s7, s17, 0xff -; VI-NEXT: s_lshl_b32 s9, s65, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s64, 0xff -; VI-NEXT: s_lshl_b32 s11, s55, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s18, 0xff -; VI-NEXT: s_lshl_b32 s9, s54, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s53, 0xff -; VI-NEXT: s_lshl_b32 s11, s42, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_and_b32 s18, s18, 0xff +; VI-NEXT: s_lshl_b32 s21, s67, 8 +; VI-NEXT: s_or_b32 s18, s18, s21 +; VI-NEXT: s_and_b32 s21, s66, 0xff +; VI-NEXT: s_lshl_b32 s23, s44, 8 +; VI-NEXT: s_or_b32 s21, s21, s23 +; VI-NEXT: s_and_b32 s18, s18, 0xffff +; VI-NEXT: s_lshl_b32 s21, s21, 16 +; VI-NEXT: s_or_b32 s18, s18, s21 +; VI-NEXT: v_mov_b32_e32 v1, s18 +; VI-NEXT: s_and_b32 s18, s19, 0xff +; VI-NEXT: s_lshl_b32 s19, s65, 8 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, s64, 0xff +; VI-NEXT: s_lshl_b32 s21, s55, 8 +; VI-NEXT: s_or_b32 s19, s19, s21 +; VI-NEXT: s_and_b32 s18, s18, 0xffff +; VI-NEXT: s_lshl_b32 s19, s19, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xff +; VI-NEXT: s_lshl_b32 s18, s54, 8 +; VI-NEXT: s_or_b32 s16, s16, s18 +; VI-NEXT: s_and_b32 s18, s53, 0xff +; VI-NEXT: s_lshl_b32 s19, s42, 8 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s18, s18, 16 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s16, s16, s18 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s19, 0xff -; VI-NEXT: s_lshl_b32 s9, s52, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s51, 0xff -; VI-NEXT: s_lshl_b32 s11, s50, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: s_and_b32 s16, s17, 0xff +; VI-NEXT: s_lshl_b32 s17, s52, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, s51, 0xff +; VI-NEXT: s_lshl_b32 s18, s50, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s16, s16, s17 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s20, 0xff -; VI-NEXT: s_lshl_b32 s9, s49, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s48, 0xff -; VI-NEXT: s_lshl_b32 s11, s40, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: s_and_b32 s14, s14, 0xff +; VI-NEXT: s_lshl_b32 s16, s49, 8 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: s_and_b32 s16, s48, 0xff +; VI-NEXT: s_lshl_b32 s17, s40, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s14, s14, 0xffff +; VI-NEXT: s_lshl_b32 s16, s16, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s14, s14, s16 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s21, 0xff -; VI-NEXT: s_lshl_b32 s9, s39, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s38, 0xff -; VI-NEXT: s_lshl_b32 s11, s37, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: s_and_b32 s14, s15, 0xff +; VI-NEXT: s_lshl_b32 s15, s39, 8 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_and_b32 s15, s38, 0xff +; VI-NEXT: s_lshl_b32 s16, s37, 8 +; VI-NEXT: s_or_b32 s15, s15, s16 +; VI-NEXT: s_and_b32 s14, s14, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s14, s14, s15 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s22, 0xff -; VI-NEXT: s_lshl_b32 s9, s36, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s35, 0xff -; VI-NEXT: s_lshl_b32 s11, s14, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: s_and_b32 s12, s12, 0xff +; VI-NEXT: s_lshl_b32 s14, s36, 8 +; VI-NEXT: s_or_b32 s12, s12, s14 +; VI-NEXT: s_and_b32 s14, s35, 0xff +; VI-NEXT: s_lshl_b32 s15, s28, 8 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_and_b32 s12, s12, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s12, s12, s14 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s23, 0xff -; VI-NEXT: s_lshl_b32 s9, s34, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s31, 0xff -; VI-NEXT: s_lshl_b32 s11, s30, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s12 +; VI-NEXT: s_and_b32 s12, s13, 0xff +; VI-NEXT: s_lshl_b32 s13, s34, 8 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_and_b32 s13, s31, 0xff +; VI-NEXT: s_lshl_b32 s14, s30, 8 +; VI-NEXT: s_or_b32 s13, s13, s14 +; VI-NEXT: s_and_b32 s12, s12, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s12, s12, s13 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s24, 0xff -; VI-NEXT: s_lshl_b32 s9, s91, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s90, 0xff -; VI-NEXT: s_lshl_b32 s11, s12, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s12 +; VI-NEXT: s_and_b32 s10, s10, 0xff +; VI-NEXT: s_lshl_b32 s12, s91, 8 +; VI-NEXT: s_or_b32 s10, s10, s12 +; VI-NEXT: s_and_b32 s12, s90, 0xff +; VI-NEXT: s_lshl_b32 s13, s26, 8 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_lshl_b32 s12, s12, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s10, s10, s12 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s25, 0xff -; VI-NEXT: s_lshl_b32 s9, s89, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s88, 0xff -; VI-NEXT: s_lshl_b32 s11, s79, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: s_and_b32 s10, s11, 0xff +; VI-NEXT: s_lshl_b32 s11, s89, 8 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s11, s88, 0xff +; VI-NEXT: s_lshl_b32 s12, s79, 8 +; VI-NEXT: s_or_b32 s11, s11, s12 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s10, s10, s11 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s26, 0xff -; VI-NEXT: s_lshl_b32 s9, s78, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s77, 0xff -; VI-NEXT: s_lshl_b32 s10, s10, 8 -; VI-NEXT: s_or_b32 s9, s9, s10 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: s_and_b32 s8, s8, 0xff +; VI-NEXT: s_lshl_b32 s10, s78, 8 +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: s_and_b32 s10, s77, 0xff +; VI-NEXT: s_lshl_b32 s11, s24, 8 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 36, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s8, s8, s10 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s27, 0xff +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: s_and_b32 s8, s9, 0xff ; VI-NEXT: s_lshl_b32 s9, s76, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s8, s8, s9 ; VI-NEXT: s_and_b32 s9, s75, 0xff ; VI-NEXT: s_lshl_b32 s10, s74, 8 ; VI-NEXT: s_or_b32 s9, s9, s10 -; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s8, s8, 0xffff ; VI-NEXT: s_lshl_b32 s9, s9, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 40, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s8, s8, s9 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s28, 0xff -; VI-NEXT: s_lshl_b32 s9, s73, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s72, 0xff -; VI-NEXT: s_lshl_b32 s8, s8, 8 -; VI-NEXT: s_or_b32 s8, s9, s8 -; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xff +; VI-NEXT: s_lshl_b32 s8, s73, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s8, s72, 0xff +; VI-NEXT: s_lshl_b32 s9, s22, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 44, v0 -; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_or_b32 s6, s6, s8 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s29, 0xff -; VI-NEXT: s_lshl_b32 s8, s63, 8 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_and_b32 s6, s7, 0xff +; VI-NEXT: s_lshl_b32 s7, s63, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s62, 0xff +; VI-NEXT: s_lshl_b32 s8, s61, 8 ; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s8, s62, 0xff -; VI-NEXT: s_lshl_b32 s9, s61, 8 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v0 -; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_and_b32 s4, s4, 0xff -; VI-NEXT: s_lshl_b32 s7, s60, 8 -; VI-NEXT: s_or_b32 s4, s4, s7 -; VI-NEXT: s_and_b32 s7, s59, 0xff -; VI-NEXT: s_lshl_b32 s6, s6, 8 -; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_lshl_b32 s6, s60, 8 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_and_b32 s6, s59, 0xff +; VI-NEXT: s_lshl_b32 s7, s20, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 52, v0 @@ -39915,28 +40255,28 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in ; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: v_readlane_b32 s67, v4, 19 -; VI-NEXT: v_readlane_b32 s66, v4, 18 -; VI-NEXT: v_readlane_b32 s65, v4, 17 -; VI-NEXT: v_readlane_b32 s64, v4, 16 -; VI-NEXT: v_readlane_b32 s55, v4, 15 -; VI-NEXT: v_readlane_b32 s54, v4, 14 -; VI-NEXT: v_readlane_b32 s53, v4, 13 -; VI-NEXT: v_readlane_b32 s52, v4, 12 -; VI-NEXT: v_readlane_b32 s51, v4, 11 -; VI-NEXT: v_readlane_b32 s50, v4, 10 -; VI-NEXT: v_readlane_b32 s49, v4, 9 -; VI-NEXT: v_readlane_b32 s48, v4, 8 -; VI-NEXT: v_readlane_b32 s39, v4, 7 -; VI-NEXT: v_readlane_b32 s38, v4, 6 -; VI-NEXT: v_readlane_b32 s37, v4, 5 -; VI-NEXT: v_readlane_b32 s36, v4, 4 -; VI-NEXT: v_readlane_b32 s35, v4, 3 -; VI-NEXT: v_readlane_b32 s34, v4, 2 -; VI-NEXT: v_readlane_b32 s31, v4, 1 -; VI-NEXT: v_readlane_b32 s30, v4, 0 +; VI-NEXT: v_readlane_b32 s67, v18, 19 +; VI-NEXT: v_readlane_b32 s66, v18, 18 +; VI-NEXT: v_readlane_b32 s65, v18, 17 +; VI-NEXT: v_readlane_b32 s64, v18, 16 +; VI-NEXT: v_readlane_b32 s55, v18, 15 +; VI-NEXT: v_readlane_b32 s54, v18, 14 +; VI-NEXT: v_readlane_b32 s53, v18, 13 +; VI-NEXT: v_readlane_b32 s52, v18, 12 +; VI-NEXT: v_readlane_b32 s51, v18, 11 +; VI-NEXT: v_readlane_b32 s50, v18, 10 +; VI-NEXT: v_readlane_b32 s49, v18, 9 +; VI-NEXT: v_readlane_b32 s48, v18, 8 +; VI-NEXT: v_readlane_b32 s39, v18, 7 +; VI-NEXT: v_readlane_b32 s38, v18, 6 +; VI-NEXT: v_readlane_b32 s37, v18, 5 +; VI-NEXT: v_readlane_b32 s36, v18, 4 +; VI-NEXT: v_readlane_b32 s35, v18, 3 +; VI-NEXT: v_readlane_b32 s34, v18, 2 +; VI-NEXT: v_readlane_b32 s31, v18, 1 +; VI-NEXT: v_readlane_b32 s30, v18, 0 ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -39961,31 +40301,31 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in ; VI-NEXT: ; implicit-def: $sgpr37 ; VI-NEXT: ; implicit-def: $sgpr36 ; VI-NEXT: ; implicit-def: $sgpr35 -; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr28 ; VI-NEXT: ; implicit-def: $sgpr34 ; VI-NEXT: ; implicit-def: $sgpr31 ; VI-NEXT: ; implicit-def: $sgpr30 ; VI-NEXT: ; implicit-def: $sgpr91 ; VI-NEXT: ; implicit-def: $sgpr90 -; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; implicit-def: $sgpr89 ; VI-NEXT: ; implicit-def: $sgpr88 ; VI-NEXT: ; implicit-def: $sgpr79 ; VI-NEXT: ; implicit-def: $sgpr78 ; VI-NEXT: ; implicit-def: $sgpr77 -; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr24 ; VI-NEXT: ; implicit-def: $sgpr76 ; VI-NEXT: ; implicit-def: $sgpr75 ; VI-NEXT: ; implicit-def: $sgpr74 ; VI-NEXT: ; implicit-def: $sgpr73 ; VI-NEXT: ; implicit-def: $sgpr72 -; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr22 ; VI-NEXT: ; implicit-def: $sgpr63 ; VI-NEXT: ; implicit-def: $sgpr62 ; VI-NEXT: ; implicit-def: $sgpr61 ; VI-NEXT: ; implicit-def: $sgpr60 ; VI-NEXT: ; implicit-def: $sgpr59 -; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr20 ; VI-NEXT: ; implicit-def: $sgpr58 ; VI-NEXT: ; implicit-def: $sgpr57 ; VI-NEXT: ; implicit-def: $sgpr56 @@ -39995,28 +40335,56 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v4, s30, 0 -; GFX9-NEXT: v_writelane_b32 v4, s31, 1 -; GFX9-NEXT: v_writelane_b32 v4, s34, 2 -; GFX9-NEXT: v_writelane_b32 v4, s35, 3 -; GFX9-NEXT: v_writelane_b32 v4, s36, 4 -; GFX9-NEXT: v_writelane_b32 v4, s37, 5 -; GFX9-NEXT: v_writelane_b32 v4, s38, 6 -; GFX9-NEXT: v_writelane_b32 v4, s39, 7 -; GFX9-NEXT: v_writelane_b32 v4, s48, 8 -; GFX9-NEXT: v_writelane_b32 v4, s49, 9 -; GFX9-NEXT: v_writelane_b32 v4, s50, 10 -; GFX9-NEXT: v_writelane_b32 v4, s51, 11 -; GFX9-NEXT: v_writelane_b32 v4, s52, 12 -; GFX9-NEXT: v_writelane_b32 v4, s53, 13 +; GFX9-NEXT: v_writelane_b32 v18, s30, 0 +; GFX9-NEXT: v_writelane_b32 v18, s31, 1 +; GFX9-NEXT: v_writelane_b32 v18, s34, 2 +; GFX9-NEXT: v_writelane_b32 v18, s35, 3 +; GFX9-NEXT: v_writelane_b32 v18, s36, 4 +; GFX9-NEXT: v_writelane_b32 v18, s37, 5 +; GFX9-NEXT: v_writelane_b32 v18, s38, 6 +; GFX9-NEXT: v_writelane_b32 v18, s39, 7 +; GFX9-NEXT: v_writelane_b32 v18, s48, 8 +; GFX9-NEXT: v_writelane_b32 v18, s49, 9 +; GFX9-NEXT: v_writelane_b32 v18, s50, 10 +; GFX9-NEXT: v_writelane_b32 v18, s51, 11 +; GFX9-NEXT: v_writelane_b32 v18, s52, 12 +; GFX9-NEXT: v_writelane_b32 v18, s53, 13 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: v_mov_b32_e32 v5, s17 +; GFX9-NEXT: v_mov_b32_e32 v6, s18 +; GFX9-NEXT: v_mov_b32_e32 v7, s19 +; GFX9-NEXT: v_mov_b32_e32 v8, s20 +; GFX9-NEXT: v_mov_b32_e32 v9, s21 +; GFX9-NEXT: v_mov_b32_e32 v10, s22 +; GFX9-NEXT: v_mov_b32_e32 v11, s23 +; GFX9-NEXT: v_mov_b32_e32 v12, s24 +; GFX9-NEXT: v_mov_b32_e32 v13, s25 +; GFX9-NEXT: v_mov_b32_e32 v14, s26 +; GFX9-NEXT: v_mov_b32_e32 v15, s27 +; GFX9-NEXT: v_mov_b32_e32 v16, s28 +; GFX9-NEXT: v_mov_b32_e32 v17, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX9-NEXT: v_writelane_b32 v4, s54, 14 +; GFX9-NEXT: v_writelane_b32 v18, s54, 14 +; GFX9-NEXT: v_readfirstlane_b32 s18, v4 +; GFX9-NEXT: v_readfirstlane_b32 s19, v5 +; GFX9-NEXT: v_readfirstlane_b32 s16, v6 +; GFX9-NEXT: v_readfirstlane_b32 s17, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s12, v10 +; GFX9-NEXT: v_readfirstlane_b32 s13, v11 +; GFX9-NEXT: v_readfirstlane_b32 s10, v12 +; GFX9-NEXT: v_readfirstlane_b32 s11, v13 +; GFX9-NEXT: v_readfirstlane_b32 s8, v14 +; GFX9-NEXT: v_readfirstlane_b32 s9, v15 +; GFX9-NEXT: v_readfirstlane_b32 s6, v16 +; GFX9-NEXT: v_readfirstlane_b32 s7, v17 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec ; GFX9-NEXT: v_readfirstlane_b32 s5, v2 -; GFX9-NEXT: v_writelane_b32 v4, s55, 15 +; GFX9-NEXT: v_writelane_b32 v18, s55, 15 ; GFX9-NEXT: s_cbranch_scc0 .LBB69_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_lshr_b32 s56, s5, 24 @@ -40024,275 +40392,275 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in ; GFX9-NEXT: s_lshr_b32 s58, s5, 8 ; GFX9-NEXT: s_lshr_b32 s59, s4, 16 ; GFX9-NEXT: s_lshr_b32 s60, s4, 8 -; GFX9-NEXT: s_lshr_b32 s61, s29, 24 -; GFX9-NEXT: s_lshr_b32 s62, s29, 16 -; GFX9-NEXT: s_lshr_b32 s63, s29, 8 -; GFX9-NEXT: s_lshr_b32 s72, s28, 16 -; GFX9-NEXT: s_lshr_b32 s73, s28, 8 -; GFX9-NEXT: s_lshr_b32 s74, s27, 24 -; GFX9-NEXT: s_lshr_b32 s75, s27, 16 -; GFX9-NEXT: s_lshr_b32 s76, s27, 8 -; GFX9-NEXT: s_lshr_b32 s77, s26, 16 -; GFX9-NEXT: s_lshr_b32 s78, s26, 8 -; GFX9-NEXT: s_lshr_b32 s79, s25, 24 -; GFX9-NEXT: s_lshr_b32 s88, s25, 16 -; GFX9-NEXT: s_lshr_b32 s89, s25, 8 -; GFX9-NEXT: s_lshr_b32 s90, s24, 16 -; GFX9-NEXT: s_lshr_b32 s91, s24, 8 -; GFX9-NEXT: s_lshr_b32 s92, s23, 24 -; GFX9-NEXT: s_lshr_b32 s93, s23, 16 -; GFX9-NEXT: s_lshr_b32 s94, s23, 8 -; GFX9-NEXT: s_lshr_b32 s95, s22, 16 -; GFX9-NEXT: s_lshr_b32 s30, s22, 8 -; GFX9-NEXT: s_lshr_b32 s31, s21, 24 -; GFX9-NEXT: s_lshr_b32 s34, s21, 16 -; GFX9-NEXT: s_lshr_b32 s35, s21, 8 -; GFX9-NEXT: s_lshr_b32 s36, s20, 16 -; GFX9-NEXT: s_lshr_b32 s37, s20, 8 -; GFX9-NEXT: s_lshr_b32 s38, s19, 24 -; GFX9-NEXT: s_lshr_b32 s39, s19, 16 -; GFX9-NEXT: s_lshr_b32 s48, s19, 8 -; GFX9-NEXT: s_lshr_b32 s49, s18, 16 -; GFX9-NEXT: s_lshr_b32 s50, s18, 8 -; GFX9-NEXT: s_lshr_b32 s51, s17, 24 -; GFX9-NEXT: s_lshr_b32 s52, s17, 16 -; GFX9-NEXT: s_lshr_b32 s53, s17, 8 -; GFX9-NEXT: s_lshr_b32 s54, s16, 16 -; GFX9-NEXT: s_lshr_b32 s55, s16, 8 -; GFX9-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 -; GFX9-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 -; GFX9-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 -; GFX9-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 -; GFX9-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 -; GFX9-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 -; GFX9-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; GFX9-NEXT: s_lshr_b32 s61, s7, 24 +; GFX9-NEXT: s_lshr_b32 s62, s7, 16 +; GFX9-NEXT: s_lshr_b32 s63, s7, 8 +; GFX9-NEXT: s_lshr_b32 s72, s6, 16 +; GFX9-NEXT: s_lshr_b32 s73, s6, 8 +; GFX9-NEXT: s_lshr_b32 s74, s9, 24 +; GFX9-NEXT: s_lshr_b32 s75, s9, 16 +; GFX9-NEXT: s_lshr_b32 s76, s9, 8 +; GFX9-NEXT: s_lshr_b32 s77, s8, 16 +; GFX9-NEXT: s_lshr_b32 s78, s8, 8 +; GFX9-NEXT: s_lshr_b32 s79, s11, 24 +; GFX9-NEXT: s_lshr_b32 s88, s11, 16 +; GFX9-NEXT: s_lshr_b32 s89, s11, 8 +; GFX9-NEXT: s_lshr_b32 s90, s10, 16 +; GFX9-NEXT: s_lshr_b32 s91, s10, 8 +; GFX9-NEXT: s_lshr_b32 s92, s13, 24 +; GFX9-NEXT: s_lshr_b32 s93, s13, 16 +; GFX9-NEXT: s_lshr_b32 s94, s13, 8 +; GFX9-NEXT: s_lshr_b32 s95, s12, 16 +; GFX9-NEXT: s_lshr_b32 s30, s12, 8 +; GFX9-NEXT: s_lshr_b32 s31, s15, 24 +; GFX9-NEXT: s_lshr_b32 s34, s15, 16 +; GFX9-NEXT: s_lshr_b32 s35, s15, 8 +; GFX9-NEXT: s_lshr_b32 s36, s14, 16 +; GFX9-NEXT: s_lshr_b32 s37, s14, 8 +; GFX9-NEXT: s_lshr_b32 s38, s17, 24 +; GFX9-NEXT: s_lshr_b32 s39, s17, 16 +; GFX9-NEXT: s_lshr_b32 s48, s17, 8 +; GFX9-NEXT: s_lshr_b32 s49, s16, 16 +; GFX9-NEXT: s_lshr_b32 s50, s16, 8 +; GFX9-NEXT: s_lshr_b32 s51, s19, 24 +; GFX9-NEXT: s_lshr_b32 s52, s19, 16 +; GFX9-NEXT: s_lshr_b32 s53, s19, 8 +; GFX9-NEXT: s_lshr_b32 s54, s18, 16 +; GFX9-NEXT: s_lshr_b32 s55, s18, 8 +; GFX9-NEXT: s_lshr_b64 s[20:21], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[22:23], s[6:7], 24 +; GFX9-NEXT: s_lshr_b64 s[24:25], s[8:9], 24 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[10:11], 24 +; GFX9-NEXT: s_lshr_b64 s[28:29], s[12:13], 24 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[14:15], 24 +; GFX9-NEXT: s_lshr_b64 s[42:43], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[44:45], s[18:19], 24 ; GFX9-NEXT: s_cbranch_execnz .LBB69_3 ; GFX9-NEXT: .LBB69_2: ; %cmp.true -; GFX9-NEXT: s_add_u32 s16, s16, 3 -; GFX9-NEXT: s_addc_u32 s17, s17, 0 ; GFX9-NEXT: s_add_u32 s18, s18, 3 ; GFX9-NEXT: s_addc_u32 s19, s19, 0 -; GFX9-NEXT: s_add_u32 s20, s20, 3 -; GFX9-NEXT: s_addc_u32 s21, s21, 0 -; GFX9-NEXT: s_add_u32 s22, s22, 3 -; GFX9-NEXT: s_addc_u32 s23, s23, 0 -; GFX9-NEXT: s_add_u32 s24, s24, 3 -; GFX9-NEXT: s_addc_u32 s25, s25, 0 -; GFX9-NEXT: s_add_u32 s26, s26, 3 -; GFX9-NEXT: s_addc_u32 s27, s27, 0 -; GFX9-NEXT: s_add_u32 s28, s28, 3 -; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_add_u32 s14, s14, 3 +; GFX9-NEXT: s_addc_u32 s15, s15, 0 +; GFX9-NEXT: s_add_u32 s12, s12, 3 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_add_u32 s10, s10, 3 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 ; GFX9-NEXT: s_add_u32 s4, s4, 3 ; GFX9-NEXT: s_addc_u32 s5, s5, 0 -; GFX9-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 -; GFX9-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 -; GFX9-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 -; GFX9-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 -; GFX9-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 -; GFX9-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 -; GFX9-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[20:21], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[22:23], s[6:7], 24 +; GFX9-NEXT: s_lshr_b64 s[24:25], s[8:9], 24 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[10:11], 24 +; GFX9-NEXT: s_lshr_b64 s[28:29], s[12:13], 24 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[14:15], 24 +; GFX9-NEXT: s_lshr_b64 s[42:43], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[44:45], s[18:19], 24 ; GFX9-NEXT: s_lshr_b32 s56, s5, 24 ; GFX9-NEXT: s_lshr_b32 s57, s5, 16 ; GFX9-NEXT: s_lshr_b32 s58, s5, 8 ; GFX9-NEXT: s_lshr_b32 s59, s4, 16 ; GFX9-NEXT: s_lshr_b32 s60, s4, 8 -; GFX9-NEXT: s_lshr_b32 s61, s29, 24 -; GFX9-NEXT: s_lshr_b32 s62, s29, 16 -; GFX9-NEXT: s_lshr_b32 s63, s29, 8 -; GFX9-NEXT: s_lshr_b32 s72, s28, 16 -; GFX9-NEXT: s_lshr_b32 s73, s28, 8 -; GFX9-NEXT: s_lshr_b32 s74, s27, 24 -; GFX9-NEXT: s_lshr_b32 s75, s27, 16 -; GFX9-NEXT: s_lshr_b32 s76, s27, 8 -; GFX9-NEXT: s_lshr_b32 s77, s26, 16 -; GFX9-NEXT: s_lshr_b32 s78, s26, 8 -; GFX9-NEXT: s_lshr_b32 s79, s25, 24 -; GFX9-NEXT: s_lshr_b32 s88, s25, 16 -; GFX9-NEXT: s_lshr_b32 s89, s25, 8 -; GFX9-NEXT: s_lshr_b32 s90, s24, 16 -; GFX9-NEXT: s_lshr_b32 s91, s24, 8 -; GFX9-NEXT: s_lshr_b32 s92, s23, 24 -; GFX9-NEXT: s_lshr_b32 s93, s23, 16 -; GFX9-NEXT: s_lshr_b32 s94, s23, 8 -; GFX9-NEXT: s_lshr_b32 s95, s22, 16 -; GFX9-NEXT: s_lshr_b32 s30, s22, 8 -; GFX9-NEXT: s_lshr_b32 s31, s21, 24 -; GFX9-NEXT: s_lshr_b32 s34, s21, 16 -; GFX9-NEXT: s_lshr_b32 s35, s21, 8 -; GFX9-NEXT: s_lshr_b32 s36, s20, 16 -; GFX9-NEXT: s_lshr_b32 s37, s20, 8 -; GFX9-NEXT: s_lshr_b32 s38, s19, 24 -; GFX9-NEXT: s_lshr_b32 s39, s19, 16 -; GFX9-NEXT: s_lshr_b32 s48, s19, 8 -; GFX9-NEXT: s_lshr_b32 s49, s18, 16 -; GFX9-NEXT: s_lshr_b32 s50, s18, 8 -; GFX9-NEXT: s_lshr_b32 s51, s17, 24 -; GFX9-NEXT: s_lshr_b32 s52, s17, 16 -; GFX9-NEXT: s_lshr_b32 s53, s17, 8 -; GFX9-NEXT: s_lshr_b32 s54, s16, 16 -; GFX9-NEXT: s_lshr_b32 s55, s16, 8 +; GFX9-NEXT: s_lshr_b32 s61, s7, 24 +; GFX9-NEXT: s_lshr_b32 s62, s7, 16 +; GFX9-NEXT: s_lshr_b32 s63, s7, 8 +; GFX9-NEXT: s_lshr_b32 s72, s6, 16 +; GFX9-NEXT: s_lshr_b32 s73, s6, 8 +; GFX9-NEXT: s_lshr_b32 s74, s9, 24 +; GFX9-NEXT: s_lshr_b32 s75, s9, 16 +; GFX9-NEXT: s_lshr_b32 s76, s9, 8 +; GFX9-NEXT: s_lshr_b32 s77, s8, 16 +; GFX9-NEXT: s_lshr_b32 s78, s8, 8 +; GFX9-NEXT: s_lshr_b32 s79, s11, 24 +; GFX9-NEXT: s_lshr_b32 s88, s11, 16 +; GFX9-NEXT: s_lshr_b32 s89, s11, 8 +; GFX9-NEXT: s_lshr_b32 s90, s10, 16 +; GFX9-NEXT: s_lshr_b32 s91, s10, 8 +; GFX9-NEXT: s_lshr_b32 s92, s13, 24 +; GFX9-NEXT: s_lshr_b32 s93, s13, 16 +; GFX9-NEXT: s_lshr_b32 s94, s13, 8 +; GFX9-NEXT: s_lshr_b32 s95, s12, 16 +; GFX9-NEXT: s_lshr_b32 s30, s12, 8 +; GFX9-NEXT: s_lshr_b32 s31, s15, 24 +; GFX9-NEXT: s_lshr_b32 s34, s15, 16 +; GFX9-NEXT: s_lshr_b32 s35, s15, 8 +; GFX9-NEXT: s_lshr_b32 s36, s14, 16 +; GFX9-NEXT: s_lshr_b32 s37, s14, 8 +; GFX9-NEXT: s_lshr_b32 s38, s17, 24 +; GFX9-NEXT: s_lshr_b32 s39, s17, 16 +; GFX9-NEXT: s_lshr_b32 s48, s17, 8 +; GFX9-NEXT: s_lshr_b32 s49, s16, 16 +; GFX9-NEXT: s_lshr_b32 s50, s16, 8 +; GFX9-NEXT: s_lshr_b32 s51, s19, 24 +; GFX9-NEXT: s_lshr_b32 s52, s19, 16 +; GFX9-NEXT: s_lshr_b32 s53, s19, 8 +; GFX9-NEXT: s_lshr_b32 s54, s18, 16 +; GFX9-NEXT: s_lshr_b32 s55, s18, 8 ; GFX9-NEXT: .LBB69_3: ; %end -; GFX9-NEXT: s_and_b32 s7, s16, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s55, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s54, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s44, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s17, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s53, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s52, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s51, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s18, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s21, s55, 8 +; GFX9-NEXT: s_or_b32 s18, s18, s21 +; GFX9-NEXT: s_and_b32 s21, s54, 0xff +; GFX9-NEXT: s_lshl_b32 s23, s44, 8 +; GFX9-NEXT: s_or_b32 s21, s21, s23 +; GFX9-NEXT: s_and_b32 s18, s18, 0xffff +; GFX9-NEXT: s_lshl_b32 s21, s21, 16 +; GFX9-NEXT: s_or_b32 s18, s18, s21 +; GFX9-NEXT: v_mov_b32_e32 v1, s18 +; GFX9-NEXT: s_and_b32 s18, s19, 0xff +; GFX9-NEXT: s_lshl_b32 s19, s53, 8 +; GFX9-NEXT: s_or_b32 s18, s18, s19 +; GFX9-NEXT: s_and_b32 s19, s52, 0xff +; GFX9-NEXT: s_lshl_b32 s21, s51, 8 +; GFX9-NEXT: s_or_b32 s19, s19, s21 +; GFX9-NEXT: s_and_b32 s18, s18, 0xffff +; GFX9-NEXT: s_lshl_b32 s19, s19, 16 +; GFX9-NEXT: s_or_b32 s18, s18, s19 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s18, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s50, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s49, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s42, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s50, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s18 +; GFX9-NEXT: s_and_b32 s18, s49, 0xff +; GFX9-NEXT: s_lshl_b32 s19, s42, 8 +; GFX9-NEXT: s_or_b32 s18, s18, s19 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s18, s18, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s18 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s19, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s48, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s39, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s38, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: s_and_b32 s16, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s48, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: s_and_b32 s17, s39, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s38, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s20, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s37, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s36, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s40, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: s_and_b32 s14, s14, 0xff +; GFX9-NEXT: s_lshl_b32 s16, s37, 8 +; GFX9-NEXT: s_or_b32 s14, s14, s16 +; GFX9-NEXT: s_and_b32 s16, s36, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s40, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: s_and_b32 s14, s14, 0xffff +; GFX9-NEXT: s_lshl_b32 s16, s16, 16 +; GFX9-NEXT: s_or_b32 s14, s14, s16 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s21, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s35, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s34, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s31, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s14 +; GFX9-NEXT: s_and_b32 s14, s15, 0xff +; GFX9-NEXT: s_lshl_b32 s15, s35, 8 +; GFX9-NEXT: s_or_b32 s14, s14, s15 +; GFX9-NEXT: s_and_b32 s15, s34, 0xff +; GFX9-NEXT: s_lshl_b32 s16, s31, 8 +; GFX9-NEXT: s_or_b32 s15, s15, s16 +; GFX9-NEXT: s_and_b32 s14, s14, 0xffff +; GFX9-NEXT: s_lshl_b32 s15, s15, 16 +; GFX9-NEXT: s_or_b32 s14, s14, s15 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s22, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s30, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s95, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s14, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s14 +; GFX9-NEXT: s_and_b32 s12, s12, 0xff +; GFX9-NEXT: s_lshl_b32 s14, s30, 8 +; GFX9-NEXT: s_or_b32 s12, s12, s14 +; GFX9-NEXT: s_and_b32 s14, s95, 0xff +; GFX9-NEXT: s_lshl_b32 s15, s28, 8 +; GFX9-NEXT: s_or_b32 s14, s14, s15 +; GFX9-NEXT: s_and_b32 s12, s12, 0xffff +; GFX9-NEXT: s_lshl_b32 s14, s14, 16 +; GFX9-NEXT: s_or_b32 s12, s12, s14 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s23, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s94, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s93, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s92, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: s_and_b32 s12, s13, 0xff +; GFX9-NEXT: s_lshl_b32 s13, s94, 8 +; GFX9-NEXT: s_or_b32 s12, s12, s13 +; GFX9-NEXT: s_and_b32 s13, s93, 0xff +; GFX9-NEXT: s_lshl_b32 s14, s92, 8 +; GFX9-NEXT: s_or_b32 s13, s13, s14 +; GFX9-NEXT: s_and_b32 s12, s12, 0xffff +; GFX9-NEXT: s_lshl_b32 s13, s13, 16 +; GFX9-NEXT: s_or_b32 s12, s12, s13 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s24, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s91, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s90, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s12, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: s_and_b32 s10, s10, 0xff +; GFX9-NEXT: s_lshl_b32 s12, s91, 8 +; GFX9-NEXT: s_or_b32 s10, s10, s12 +; GFX9-NEXT: s_and_b32 s12, s90, 0xff +; GFX9-NEXT: s_lshl_b32 s13, s26, 8 +; GFX9-NEXT: s_or_b32 s12, s12, s13 +; GFX9-NEXT: s_and_b32 s10, s10, 0xffff +; GFX9-NEXT: s_lshl_b32 s12, s12, 16 +; GFX9-NEXT: s_or_b32 s10, s10, s12 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s25, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s89, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s88, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s79, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NEXT: s_and_b32 s10, s11, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s89, 8 +; GFX9-NEXT: s_or_b32 s10, s10, s11 +; GFX9-NEXT: s_and_b32 s11, s88, 0xff +; GFX9-NEXT: s_lshl_b32 s12, s79, 8 +; GFX9-NEXT: s_or_b32 s11, s11, s12 +; GFX9-NEXT: s_and_b32 s10, s10, 0xffff +; GFX9-NEXT: s_lshl_b32 s11, s11, 16 +; GFX9-NEXT: s_or_b32 s10, s10, s11 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s26, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s78, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s77, 0xff -; GFX9-NEXT: s_lshl_b32 s10, s10, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s10 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NEXT: s_and_b32 s8, s8, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s78, 8 +; GFX9-NEXT: s_or_b32 s8, s8, s10 +; GFX9-NEXT: s_and_b32 s10, s77, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s24, 8 +; GFX9-NEXT: s_or_b32 s10, s10, s11 +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: s_lshl_b32 s10, s10, 16 +; GFX9-NEXT: s_or_b32 s8, s8, s10 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s27, 0xff +; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: s_and_b32 s8, s9, 0xff ; GFX9-NEXT: s_lshl_b32 s9, s76, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_or_b32 s8, s8, s9 ; GFX9-NEXT: s_and_b32 s9, s75, 0xff ; GFX9-NEXT: s_lshl_b32 s10, s74, 8 ; GFX9-NEXT: s_or_b32 s9, s9, s10 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff ; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_or_b32 s8, s8, s9 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s28, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s73, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s72, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s8, 8 -; GFX9-NEXT: s_or_b32 s8, s9, s8 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s8, s8, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s29, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s63, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: s_and_b32 s8, s62, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s61, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s73, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s8 +; GFX9-NEXT: s_and_b32 s8, s72, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s22, 8 ; GFX9-NEXT: s_or_b32 s8, s8, s9 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s8 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: s_and_b32 s6, s7, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s63, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s62, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s61, 8 ; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: s_and_b32 s4, s4, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s60, 8 -; GFX9-NEXT: s_or_b32 s4, s4, s7 -; GFX9-NEXT: s_and_b32 s7, s59, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s6, 8 -; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_lshl_b32 s6, s60, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s59, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s20, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 ; GFX9-NEXT: s_or_b32 s4, s4, s6 @@ -40310,24 +40678,24 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: v_readlane_b32 s55, v4, 15 -; GFX9-NEXT: v_readlane_b32 s54, v4, 14 -; GFX9-NEXT: v_readlane_b32 s53, v4, 13 -; GFX9-NEXT: v_readlane_b32 s52, v4, 12 -; GFX9-NEXT: v_readlane_b32 s51, v4, 11 -; GFX9-NEXT: v_readlane_b32 s50, v4, 10 -; GFX9-NEXT: v_readlane_b32 s49, v4, 9 -; GFX9-NEXT: v_readlane_b32 s48, v4, 8 -; GFX9-NEXT: v_readlane_b32 s39, v4, 7 -; GFX9-NEXT: v_readlane_b32 s38, v4, 6 -; GFX9-NEXT: v_readlane_b32 s37, v4, 5 -; GFX9-NEXT: v_readlane_b32 s36, v4, 4 -; GFX9-NEXT: v_readlane_b32 s35, v4, 3 -; GFX9-NEXT: v_readlane_b32 s34, v4, 2 -; GFX9-NEXT: v_readlane_b32 s31, v4, 1 -; GFX9-NEXT: v_readlane_b32 s30, v4, 0 +; GFX9-NEXT: v_readlane_b32 s55, v18, 15 +; GFX9-NEXT: v_readlane_b32 s54, v18, 14 +; GFX9-NEXT: v_readlane_b32 s53, v18, 13 +; GFX9-NEXT: v_readlane_b32 s52, v18, 12 +; GFX9-NEXT: v_readlane_b32 s51, v18, 11 +; GFX9-NEXT: v_readlane_b32 s50, v18, 10 +; GFX9-NEXT: v_readlane_b32 s49, v18, 9 +; GFX9-NEXT: v_readlane_b32 s48, v18, 8 +; GFX9-NEXT: v_readlane_b32 s39, v18, 7 +; GFX9-NEXT: v_readlane_b32 s38, v18, 6 +; GFX9-NEXT: v_readlane_b32 s37, v18, 5 +; GFX9-NEXT: v_readlane_b32 s36, v18, 4 +; GFX9-NEXT: v_readlane_b32 s35, v18, 3 +; GFX9-NEXT: v_readlane_b32 s34, v18, 2 +; GFX9-NEXT: v_readlane_b32 s31, v18, 1 +; GFX9-NEXT: v_readlane_b32 s30, v18, 0 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -40352,31 +40720,31 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in ; GFX9-NEXT: ; implicit-def: $sgpr31 ; GFX9-NEXT: ; implicit-def: $sgpr30 ; GFX9-NEXT: ; implicit-def: $sgpr95 -; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr28 ; GFX9-NEXT: ; implicit-def: $sgpr94 ; GFX9-NEXT: ; implicit-def: $sgpr93 ; GFX9-NEXT: ; implicit-def: $sgpr92 ; GFX9-NEXT: ; implicit-def: $sgpr91 ; GFX9-NEXT: ; implicit-def: $sgpr90 -; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; implicit-def: $sgpr89 ; GFX9-NEXT: ; implicit-def: $sgpr88 ; GFX9-NEXT: ; implicit-def: $sgpr79 ; GFX9-NEXT: ; implicit-def: $sgpr78 ; GFX9-NEXT: ; implicit-def: $sgpr77 -; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr24 ; GFX9-NEXT: ; implicit-def: $sgpr76 ; GFX9-NEXT: ; implicit-def: $sgpr75 ; GFX9-NEXT: ; implicit-def: $sgpr74 ; GFX9-NEXT: ; implicit-def: $sgpr73 ; GFX9-NEXT: ; implicit-def: $sgpr72 -; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr22 ; GFX9-NEXT: ; implicit-def: $sgpr63 ; GFX9-NEXT: ; implicit-def: $sgpr62 ; GFX9-NEXT: ; implicit-def: $sgpr61 ; GFX9-NEXT: ; implicit-def: $sgpr60 ; GFX9-NEXT: ; implicit-def: $sgpr59 -; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr20 ; GFX9-NEXT: ; implicit-def: $sgpr58 ; GFX9-NEXT: ; implicit-def: $sgpr57 ; GFX9-NEXT: ; implicit-def: $sgpr56 @@ -45916,111 +46284,139 @@ define inreg <8 x double> @bitcast_v32i16_to_v8f64_scalar(<32 x i16> inreg %a, i ; VI-LABEL: bitcast_v32i16_to_v8f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, s16 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v5, s18 +; VI-NEXT: v_mov_b32_e32 v6, s19 +; VI-NEXT: v_mov_b32_e32 v7, s20 +; VI-NEXT: v_mov_b32_e32 v8, s21 +; VI-NEXT: v_mov_b32_e32 v9, s22 +; VI-NEXT: v_mov_b32_e32 v10, s23 +; VI-NEXT: v_mov_b32_e32 v11, s24 +; VI-NEXT: v_mov_b32_e32 v12, s25 +; VI-NEXT: v_mov_b32_e32 v13, s26 +; VI-NEXT: v_mov_b32_e32 v14, s27 +; VI-NEXT: v_mov_b32_e32 v15, s28 +; VI-NEXT: v_mov_b32_e32 v16, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_readfirstlane_b32 s6, v0 +; VI-NEXT: v_readfirstlane_b32 s6, v3 +; VI-NEXT: v_readfirstlane_b32 s7, v4 +; VI-NEXT: v_readfirstlane_b32 s8, v5 +; VI-NEXT: v_readfirstlane_b32 s9, v6 +; VI-NEXT: v_readfirstlane_b32 s10, v7 +; VI-NEXT: v_readfirstlane_b32 s11, v8 +; VI-NEXT: v_readfirstlane_b32 s12, v9 +; VI-NEXT: v_readfirstlane_b32 s13, v10 +; VI-NEXT: v_readfirstlane_b32 s14, v11 +; VI-NEXT: v_readfirstlane_b32 s15, v12 +; VI-NEXT: v_readfirstlane_b32 s16, v13 +; VI-NEXT: v_readfirstlane_b32 s17, v14 +; VI-NEXT: v_readfirstlane_b32 s18, v15 +; VI-NEXT: v_readfirstlane_b32 s19, v16 +; VI-NEXT: v_readfirstlane_b32 s20, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_readfirstlane_b32 s7, v1 +; VI-NEXT: v_readfirstlane_b32 s21, v1 ; VI-NEXT: s_cbranch_scc0 .LBB75_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB75_3 ; VI-NEXT: .LBB75_2: ; %cmp.true -; VI-NEXT: s_add_i32 s5, s7, 3 -; VI-NEXT: s_and_b32 s4, s7, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s6, 3 -; VI-NEXT: s_add_i32 s7, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s6, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s29, 3 -; VI-NEXT: s_add_i32 s6, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s28, 3 -; VI-NEXT: s_add_i32 s29, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s27, 3 -; VI-NEXT: s_add_i32 s28, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s26, 3 -; VI-NEXT: s_add_i32 s27, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s25, 3 -; VI-NEXT: s_add_i32 s26, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s15, 3 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s15, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s24, 3 -; VI-NEXT: s_add_i32 s25, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s14, 3 +; VI-NEXT: s_add_i32 s15, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s14, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s23, 3 -; VI-NEXT: s_add_i32 s24, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s13, 3 +; VI-NEXT: s_add_i32 s14, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s13, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s22, 3 -; VI-NEXT: s_add_i32 s23, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s12, 3 +; VI-NEXT: s_add_i32 s13, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s12, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s21, 3 -; VI-NEXT: s_add_i32 s22, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s11, 3 +; VI-NEXT: s_add_i32 s12, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s11, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s20, 3 -; VI-NEXT: s_add_i32 s21, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s10, 3 +; VI-NEXT: s_add_i32 s11, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s10, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s19, 3 -; VI-NEXT: s_add_i32 s20, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s9, 3 +; VI-NEXT: s_add_i32 s10, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s9, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s18, 3 -; VI-NEXT: s_add_i32 s19, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s8, 3 +; VI-NEXT: s_add_i32 s9, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s8, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s17, 3 -; VI-NEXT: s_add_i32 s18, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s7, 3 +; VI-NEXT: s_add_i32 s8, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s7, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_add_i32 s17, s4, 0x30000 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s6, 3 +; VI-NEXT: s_add_i32 s7, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s6, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: s_add_i32 s6, s4, 0x30000 ; VI-NEXT: .LBB75_3: ; %end -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 -; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 -; VI-NEXT: v_mov_b32_e32 v13, s29 -; VI-NEXT: v_mov_b32_e32 v14, s6 -; VI-NEXT: v_mov_b32_e32 v15, s7 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v5, s11 +; VI-NEXT: v_mov_b32_e32 v6, s12 +; VI-NEXT: v_mov_b32_e32 v7, s13 +; VI-NEXT: v_mov_b32_e32 v8, s14 +; VI-NEXT: v_mov_b32_e32 v9, s15 +; VI-NEXT: v_mov_b32_e32 v10, s16 +; VI-NEXT: v_mov_b32_e32 v11, s17 +; VI-NEXT: v_mov_b32_e32 v12, s18 +; VI-NEXT: v_mov_b32_e32 v13, s19 +; VI-NEXT: v_mov_b32_e32 v14, s20 +; VI-NEXT: v_mov_b32_e32 v15, s21 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB75_4: ; VI-NEXT: s_branch .LBB75_2 @@ -46394,102 +46790,114 @@ define inreg <32 x half> @bitcast_v8f64_to_v32f16_scalar(<8 x double> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_readfirstlane_b32 s4, v0 -; SI-NEXT: s_and_b64 s[6:7], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s5, v1 +; SI-NEXT: v_mov_b32_e32 v54, s16 +; SI-NEXT: v_mov_b32_e32 v55, s17 +; SI-NEXT: v_mov_b32_e32 v52, s18 +; SI-NEXT: v_mov_b32_e32 v53, s19 +; SI-NEXT: v_mov_b32_e32 v50, s20 +; SI-NEXT: v_mov_b32_e32 v51, s21 +; SI-NEXT: v_mov_b32_e32 v48, s22 +; SI-NEXT: v_mov_b32_e32 v49, s23 +; SI-NEXT: v_mov_b32_e32 v38, s24 +; SI-NEXT: v_mov_b32_e32 v39, s25 +; SI-NEXT: v_mov_b32_e32 v36, s26 +; SI-NEXT: v_mov_b32_e32 v37, s27 +; SI-NEXT: v_mov_b32_e32 v34, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v35, s29 ; SI-NEXT: s_cbranch_scc0 .LBB77_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s6, s5, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s6 -; SI-NEXT: s_lshr_b32 s6, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s6 -; SI-NEXT: s_lshr_b32 s6, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s6 -; SI-NEXT: s_lshr_b32 s6, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s6 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s6 -; SI-NEXT: s_lshr_b32 s6, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s6 -; SI-NEXT: s_lshr_b32 s6, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s6 -; SI-NEXT: s_lshr_b32 s6, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s6 -; SI-NEXT: s_lshr_b32 s6, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s6 -; SI-NEXT: s_lshr_b32 s6, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 -; SI-NEXT: s_lshr_b32 s6, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s6 -; SI-NEXT: s_lshr_b32 s6, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 -; SI-NEXT: s_lshr_b32 s6, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 -; SI-NEXT: s_lshr_b32 s6, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: s_lshr_b32 s6, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 -; SI-NEXT: s_lshr_b32 s6, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v54 ; SI-NEXT: s_cbranch_execnz .LBB77_3 ; SI-NEXT: .LBB77_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 -; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 -; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 -; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 -; SI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 -; SI-NEXT: v_add_f64 v[12:13], s[28:29], 1.0 -; SI-NEXT: v_add_f64 v[14:15], s[4:5], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; SI-NEXT: v_add_f64 v[31:32], v[54:55], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[52:53], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[50:51], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[48:49], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[38:39], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[36:37], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[34:35], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v12 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v14 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 @@ -46497,19 +46905,21 @@ define inreg <32 x half> @bitcast_v8f64_to_v32f16_scalar(<8 x double> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: .LBB77_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v32 +; SI-NEXT: v_mov_b32_e32 v1, v33 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB77_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 @@ -49580,695 +49990,665 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; VI-LABEL: bitcast_v32bf16_to_v8f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v20, s30, 0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_writelane_b32 v20, s31, 1 -; VI-NEXT: v_readfirstlane_b32 s30, v0 +; VI-NEXT: v_mov_b32_e32 v10, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_readfirstlane_b32 s31, v1 -; VI-NEXT: s_cbranch_scc0 .LBB83_3 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_cbranch_scc0 .LBB83_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB83_4 +; VI-NEXT: s_cbranch_execnz .LBB83_3 ; VI-NEXT: .LBB83_2: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v16, 0x40c00000 -; VI-NEXT: s_lshl_b32 s4, s30, 16 -; VI-NEXT: v_add_f32_e32 v0, s4, v16 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; VI-NEXT: v_add_f32_e32 v1, s4, v16 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v16 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: s_and_b32 s6, s28, 0xffff0000 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; VI-NEXT: v_add_f32_e32 v2, s6, v16 -; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc -; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[0:1] -; VI-NEXT: v_cndmask_b32_e64 v0, v3, v4, s[4:5] -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; VI-NEXT: s_and_b32 s5, s26, 0xffff0000 -; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[0:1] -; VI-NEXT: v_add_f32_e32 v0, s5, v16 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: s_lshl_b32 s4, s26, 16 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_add_f32_e32 v0, s4, v16 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_and_b32 s5, s24, 0xffff0000 -; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[0:1] -; VI-NEXT: v_add_f32_e32 v0, s5, v16 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_add_f32_e32 v0, s4, v16 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_and_b32 s5, s22, 0xffff0000 -; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[0:1] -; VI-NEXT: v_add_f32_e32 v0, s5, v16 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_add_f32_e32 v0, s4, v16 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_and_b32 s5, s20, 0xffff0000 -; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[0:1] -; VI-NEXT: v_add_f32_e32 v0, s5, v16 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_add_f32_e32 v0, s4, v16 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_and_b32 s5, s18, 0xffff0000 -; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] -; VI-NEXT: v_add_f32_e32 v0, s5, v16 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_add_f32_e32 v0, s4, v16 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] -; VI-NEXT: v_add_f32_e32 v0, s4, v16 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; VI-NEXT: v_add_f32_e32 v0, s6, v16 -; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; VI-NEXT: s_lshl_b32 s6, s17, 16 -; VI-NEXT: v_cndmask_b32_e64 v0, v1, v3, s[4:5] -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; VI-NEXT: v_add_f32_e32 v3, s6, v16 -; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] -; VI-NEXT: v_bfe_u32 v1, v3, 16, 1 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; VI-NEXT: v_add_f32_e32 v3, s6, v16 -; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[16:17] +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v17, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_cndmask_b32_e32 v14, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[17:18] +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v13 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v17, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15 +; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v18, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[18:19] +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v11 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v18, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v19, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_cndmask_b32_e32 v10, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v10 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[19:20] +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v9 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v19, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_lshrrev_b64 v[19:20], 16, v[19:20] +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v20, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[20:21] +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v7 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v20, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[20:21] +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v21, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[21:22] +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v21, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_lshrrev_b64 v[21:22], 16, v[21:22] +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v22, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[22:23] +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v22, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc -; VI-NEXT: s_lshl_b32 s6, s19, 16 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 -; VI-NEXT: v_add_f32_e32 v3, s6, v16 -; VI-NEXT: v_cndmask_b32_e64 v17, v1, v5, s[4:5] +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: s_and_b32 s6, s19, 0xffff0000 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[22:23] ; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; VI-NEXT: v_add_f32_e32 v3, s6, v16 -; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] -; VI-NEXT: v_bfe_u32 v9, v3, 16, 1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v23, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[23:24] +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v3 -; VI-NEXT: s_lshl_b32 s6, s21, 16 -; VI-NEXT: v_mov_b32_e32 v1, v17 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_cndmask_b32_e64 v17, v5, v7, s[4:5] -; VI-NEXT: v_add_f32_e32 v5, s6, v16 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v3 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 -; VI-NEXT: s_and_b32 s6, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v9, v11, vcc -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; VI-NEXT: v_add_f32_e32 v5, s6, v16 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 -; VI-NEXT: v_bfe_u32 v11, v5, 16, 1 -; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] -; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v5 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 -; VI-NEXT: s_lshl_b32 s6, s23, 16 -; VI-NEXT: v_mov_b32_e32 v3, v17 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_cndmask_b32_e64 v17, v7, v9, s[4:5] -; VI-NEXT: v_add_f32_e32 v7, s6, v16 -; VI-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc -; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 -; VI-NEXT: s_and_b32 s6, s23, 0xffff0000 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v7, v7 -; VI-NEXT: v_add_f32_e32 v7, s6, v16 -; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] -; VI-NEXT: v_bfe_u32 v13, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v7 -; VI-NEXT: s_lshl_b32 s6, s25, 16 -; VI-NEXT: v_mov_b32_e32 v5, v17 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_cndmask_b32_e64 v17, v9, v11, s[4:5] -; VI-NEXT: v_add_f32_e32 v9, s6, v16 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 -; VI-NEXT: s_and_b32 s6, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v7, v13, v15, vcc -; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v9, v9 -; VI-NEXT: v_add_f32_e32 v9, s6, v16 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 -; VI-NEXT: v_bfe_u32 v15, v9, 16, 1 -; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] -; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v9 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 -; VI-NEXT: v_mov_b32_e32 v7, v17 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: v_cndmask_b32_e32 v9, v15, v17, vcc -; VI-NEXT: s_lshl_b32 s6, s27, 16 -; VI-NEXT: v_cndmask_b32_e64 v17, v11, v13, s[4:5] -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 -; VI-NEXT: v_add_f32_e32 v11, s6, v16 -; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] -; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 -; VI-NEXT: s_and_b32 s6, s27, 0xffff0000 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v11, v11 -; VI-NEXT: v_add_f32_e32 v11, s6, v16 -; VI-NEXT: v_mov_b32_e32 v9, v17 -; VI-NEXT: v_bfe_u32 v17, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v11 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v11 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: s_and_b32 s7, s31, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc -; VI-NEXT: v_cndmask_b32_e64 v17, v13, v15, s[4:5] -; VI-NEXT: v_add_f32_e32 v13, s7, v16 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 -; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 -; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] -; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 -; VI-NEXT: s_lshl_b32 s6, s31, 16 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 -; VI-NEXT: v_mov_b32_e32 v11, v17 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_add_f32_e32 v13, s6, v16 -; VI-NEXT: v_cndmask_b32_e32 v15, v15, v17, vcc -; VI-NEXT: v_bfe_u32 v17, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v13 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_add_f32_e32 v13, s4, v16 -; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 -; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_cndmask_b32_e32 v13, v15, v19, vcc -; VI-NEXT: v_add_f32_e32 v15, s4, v16 -; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: v_cndmask_b32_e32 v15, v16, v19, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 -; VI-NEXT: v_lshrrev_b64 v[15:16], 16, v[15:16] -; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[17:18] -; VI-NEXT: v_mov_b32_e32 v13, v15 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v23, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[23:24] +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v24, v3, v5, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[24:25] +; VI-NEXT: v_mov_b32_e32 v1, v23 +; VI-NEXT: v_mov_b32_e32 v3, v22 +; VI-NEXT: v_mov_b32_e32 v5, v21 +; VI-NEXT: v_mov_b32_e32 v7, v20 +; VI-NEXT: v_mov_b32_e32 v9, v19 +; VI-NEXT: v_mov_b32_e32 v11, v18 +; VI-NEXT: v_mov_b32_e32 v13, v17 ; VI-NEXT: v_mov_b32_e32 v15, v16 -; VI-NEXT: s_branch .LBB83_5 -; VI-NEXT: .LBB83_3: -; VI-NEXT: s_branch .LBB83_2 -; VI-NEXT: .LBB83_4: -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 -; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 -; VI-NEXT: v_mov_b32_e32 v13, s29 -; VI-NEXT: v_mov_b32_e32 v14, s30 -; VI-NEXT: v_mov_b32_e32 v15, s31 -; VI-NEXT: .LBB83_5: ; %end -; VI-NEXT: v_readlane_b32 s31, v20, 1 -; VI-NEXT: v_readlane_b32 s30, v20, 0 -; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: .LBB83_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB83_4: +; VI-NEXT: s_branch .LBB83_2 ; ; GFX9-LABEL: bitcast_v32bf16_to_v8f64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v20, s30, 0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: v_writelane_b32 v20, s31, 1 -; GFX9-NEXT: v_readfirstlane_b32 s30, v0 +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_readfirstlane_b32 s31, v1 -; GFX9-NEXT: s_cbranch_scc0 .LBB83_3 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB83_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB83_4 +; GFX9-NEXT: s_cbranch_execnz .LBB83_3 ; GFX9-NEXT: .LBB83_2: ; %cmp.true -; GFX9-NEXT: s_and_b32 s4, s31, 0xffff0000 -; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s31, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b32 s4, s30, 0xffff0000 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: s_lshl_b32 s4, s30, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v16 +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v16, v16, v15 +; GFX9-NEXT: v_add_u32_e32 v16, 0x7fff, v16 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v18, vcc ; GFX9-NEXT: v_mov_b32_e32 v16, 0xffff -; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_and_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v15, v17, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v14 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v14, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v14 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v14 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v14, v16, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v14, v17, 16, v14 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v13 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v13, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v13 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v13, v17, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v12 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v12, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v12 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v12 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v12, v16, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v12, v17, 16, v12 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v11 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v11, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v11 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v11 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v11, v16, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v11, v17, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v10 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v10, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v10 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v10, v16, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v10, v17, 16, v10 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v9 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v9, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v9 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v9, v16, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v9, v17, 16, v9 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v8, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v8 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v8, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v8, v17, 16, v8 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v7 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v7, v16, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v7, v17, 16, v7 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v6 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v6 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v6, v16, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v6, v17, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v5 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v5 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v5, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v5, v17, 16, v5 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v4 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v4 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s29, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s28, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s28, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s27, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s26, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s26, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s25, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s24, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s24, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s23, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s22, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s21, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s20, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s19, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v17, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s18, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v17, v17, v2 -; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v4, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v4, v17, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v3 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v2 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v17, v17, v1 -; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s17, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc -; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_lshl_or_b32 v2, v17, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v17, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v17 -; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_bfe_u32 v18, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v1 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v17, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v0 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 -; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc ; GFX9-NEXT: v_bfe_u32 v18, v0, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v18, v18, v0 ; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: v_and_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v16 -; GFX9-NEXT: s_branch .LBB83_5 -; GFX9-NEXT: .LBB83_3: -; GFX9-NEXT: s_branch .LBB83_2 -; GFX9-NEXT: .LBB83_4: -; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 -; GFX9-NEXT: v_mov_b32_e32 v4, s20 -; GFX9-NEXT: v_mov_b32_e32 v5, s21 -; GFX9-NEXT: v_mov_b32_e32 v6, s22 -; GFX9-NEXT: v_mov_b32_e32 v7, s23 -; GFX9-NEXT: v_mov_b32_e32 v8, s24 -; GFX9-NEXT: v_mov_b32_e32 v9, s25 -; GFX9-NEXT: v_mov_b32_e32 v10, s26 -; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: v_mov_b32_e32 v12, s28 -; GFX9-NEXT: v_mov_b32_e32 v13, s29 -; GFX9-NEXT: v_mov_b32_e32 v14, s30 -; GFX9-NEXT: v_mov_b32_e32 v15, s31 -; GFX9-NEXT: .LBB83_5: ; %end -; GFX9-NEXT: v_readlane_b32 s31, v20, 1 -; GFX9-NEXT: v_readlane_b32 s30, v20, 0 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v17, 16, v0 +; GFX9-NEXT: .LBB83_3: ; %end ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB83_4: +; GFX9-NEXT: s_branch .LBB83_2 ; ; GFX11-TRUE16-LABEL: bitcast_v32bf16_to_v8f64_scalar: ; GFX11-TRUE16: ; %bb.0: @@ -52616,10 +52996,38 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: v_writelane_b32 v40, s83, 27 ; SI-NEXT: v_writelane_b32 v40, s84, 28 ; SI-NEXT: v_writelane_b32 v40, s85, 29 +; SI-NEXT: v_mov_b32_e32 v4, s16 +; SI-NEXT: v_mov_b32_e32 v5, s17 +; SI-NEXT: v_mov_b32_e32 v6, s18 +; SI-NEXT: v_mov_b32_e32 v7, s19 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v9, s21 +; SI-NEXT: v_mov_b32_e32 v10, s22 +; SI-NEXT: v_mov_b32_e32 v11, s23 +; SI-NEXT: v_mov_b32_e32 v12, s24 +; SI-NEXT: v_mov_b32_e32 v13, s25 +; SI-NEXT: v_mov_b32_e32 v14, s26 +; SI-NEXT: v_mov_b32_e32 v15, s27 +; SI-NEXT: v_mov_b32_e32 v16, s28 +; SI-NEXT: v_mov_b32_e32 v17, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; SI-NEXT: v_writelane_b32 v40, s86, 30 +; SI-NEXT: v_readfirstlane_b32 s18, v4 +; SI-NEXT: v_readfirstlane_b32 s19, v5 +; SI-NEXT: v_readfirstlane_b32 s16, v6 +; SI-NEXT: v_readfirstlane_b32 s17, v7 +; SI-NEXT: v_readfirstlane_b32 s14, v8 +; SI-NEXT: v_readfirstlane_b32 s15, v9 +; SI-NEXT: v_readfirstlane_b32 s12, v10 +; SI-NEXT: v_readfirstlane_b32 s13, v11 +; SI-NEXT: v_readfirstlane_b32 s10, v12 +; SI-NEXT: v_readfirstlane_b32 s11, v13 +; SI-NEXT: v_readfirstlane_b32 s8, v14 +; SI-NEXT: v_readfirstlane_b32 s9, v15 +; SI-NEXT: v_readfirstlane_b32 s6, v16 +; SI-NEXT: v_readfirstlane_b32 s7, v17 ; SI-NEXT: v_readfirstlane_b32 s4, v1 -; SI-NEXT: s_and_b64 s[6:7], vcc, exec +; SI-NEXT: s_and_b64 s[20:21], vcc, exec ; SI-NEXT: v_readfirstlane_b32 s5, v2 ; SI-NEXT: v_writelane_b32 v40, s87, 31 ; SI-NEXT: s_cbranch_scc0 .LBB85_3 @@ -52627,62 +53035,62 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: s_lshr_b32 s48, s5, 24 ; SI-NEXT: s_lshr_b32 s49, s5, 16 ; SI-NEXT: s_lshr_b32 s50, s5, 8 -; SI-NEXT: s_lshr_b32 s51, s29, 24 -; SI-NEXT: s_lshr_b32 s52, s29, 16 -; SI-NEXT: s_lshr_b32 s53, s29, 8 -; SI-NEXT: s_lshr_b32 s54, s27, 24 -; SI-NEXT: s_lshr_b32 s55, s27, 16 -; SI-NEXT: s_lshr_b32 s64, s27, 8 -; SI-NEXT: s_lshr_b32 s65, s25, 24 -; SI-NEXT: s_lshr_b32 s66, s25, 16 -; SI-NEXT: s_lshr_b32 s67, s25, 8 -; SI-NEXT: s_lshr_b32 s68, s23, 24 -; SI-NEXT: s_lshr_b32 s69, s23, 16 -; SI-NEXT: s_lshr_b32 s70, s23, 8 -; SI-NEXT: s_lshr_b32 s71, s21, 24 -; SI-NEXT: s_lshr_b32 s80, s21, 16 -; SI-NEXT: s_lshr_b32 s81, s21, 8 -; SI-NEXT: s_lshr_b32 s82, s19, 24 -; SI-NEXT: s_lshr_b32 s83, s19, 16 -; SI-NEXT: s_lshr_b32 s84, s19, 8 -; SI-NEXT: s_lshr_b32 s85, s17, 24 -; SI-NEXT: s_lshr_b32 s86, s17, 16 -; SI-NEXT: s_lshr_b32 s87, s17, 8 -; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 8 -; SI-NEXT: s_lshr_b64 s[46:47], s[28:29], 24 -; SI-NEXT: s_lshr_b64 s[60:61], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[72:73], s[28:29], 8 -; SI-NEXT: s_lshr_b64 s[76:77], s[26:27], 24 -; SI-NEXT: s_lshr_b64 s[78:79], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[88:89], s[26:27], 8 -; SI-NEXT: s_lshr_b64 s[92:93], s[24:25], 24 -; SI-NEXT: s_lshr_b64 s[94:95], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[30:31], s[24:25], 8 -; SI-NEXT: s_lshr_b64 s[34:35], s[22:23], 24 -; SI-NEXT: s_lshr_b64 s[36:37], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[38:39], s[22:23], 8 -; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 -; SI-NEXT: s_lshr_b64 s[12:13], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[40:41], s[20:21], 8 -; SI-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 -; SI-NEXT: s_lshr_b64 s[44:45], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[56:57], s[18:19], 8 -; SI-NEXT: s_lshr_b64 s[58:59], s[16:17], 24 -; SI-NEXT: s_lshr_b64 s[62:63], s[16:17], 16 -; SI-NEXT: s_lshr_b64 s[74:75], s[16:17], 8 +; SI-NEXT: s_lshr_b32 s51, s7, 24 +; SI-NEXT: s_lshr_b32 s52, s7, 16 +; SI-NEXT: s_lshr_b32 s53, s7, 8 +; SI-NEXT: s_lshr_b32 s54, s9, 24 +; SI-NEXT: s_lshr_b32 s55, s9, 16 +; SI-NEXT: s_lshr_b32 s64, s9, 8 +; SI-NEXT: s_lshr_b32 s65, s11, 24 +; SI-NEXT: s_lshr_b32 s66, s11, 16 +; SI-NEXT: s_lshr_b32 s67, s11, 8 +; SI-NEXT: s_lshr_b32 s68, s13, 24 +; SI-NEXT: s_lshr_b32 s69, s13, 16 +; SI-NEXT: s_lshr_b32 s70, s13, 8 +; SI-NEXT: s_lshr_b32 s71, s15, 24 +; SI-NEXT: s_lshr_b32 s80, s15, 16 +; SI-NEXT: s_lshr_b32 s81, s15, 8 +; SI-NEXT: s_lshr_b32 s82, s17, 24 +; SI-NEXT: s_lshr_b32 s83, s17, 16 +; SI-NEXT: s_lshr_b32 s84, s17, 8 +; SI-NEXT: s_lshr_b32 s85, s19, 24 +; SI-NEXT: s_lshr_b32 s86, s19, 16 +; SI-NEXT: s_lshr_b32 s87, s19, 8 +; SI-NEXT: s_lshr_b64 s[20:21], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[4:5], 8 +; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[60:61], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[76:77], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[78:79], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[92:93], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[94:95], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[10:11], 8 +; SI-NEXT: s_lshr_b64 s[34:35], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[36:37], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[38:39], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[22:23], s[14:15], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[14:15], 8 +; SI-NEXT: s_lshr_b64 s[42:43], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[44:45], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[16:17], 8 +; SI-NEXT: s_lshr_b64 s[58:59], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[62:63], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[18:19], 8 ; SI-NEXT: s_cbranch_execnz .LBB85_4 ; SI-NEXT: .LBB85_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[28:29], s[18:19], 1.0 -; SI-NEXT: v_add_f64 v[5:6], s[26:27], 1.0 -; SI-NEXT: v_add_f64 v[13:14], s[22:23], 1.0 -; SI-NEXT: v_add_f64 v[3:4], s[28:29], 1.0 +; SI-NEXT: v_add_f64 v[28:29], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[5:6], s[8:9], 1.0 +; SI-NEXT: v_add_f64 v[13:14], s[12:13], 1.0 +; SI-NEXT: v_add_f64 v[3:4], s[6:7], 1.0 ; SI-NEXT: v_lshr_b64 v[48:49], v[28:29], 24 ; SI-NEXT: v_add_f64 v[1:2], s[4:5], 1.0 -; SI-NEXT: v_add_f64 v[7:8], s[24:25], 1.0 -; SI-NEXT: v_add_f64 v[20:21], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[32:33], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[7:8], s[10:11], 1.0 +; SI-NEXT: v_add_f64 v[20:21], s[14:15], 1.0 +; SI-NEXT: v_add_f64 v[32:33], s[18:19], 1.0 ; SI-NEXT: v_lshr_b64 v[22:23], v[5:6], 16 ; SI-NEXT: v_lshr_b64 v[34:35], v[13:14], 16 ; SI-NEXT: v_lshr_b64 v[49:50], v[28:29], 16 @@ -52700,13 +53108,13 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: v_lshr_b64 v[25:26], v[7:8], 16 ; SI-NEXT: v_lshr_b64 v[37:38], v[20:21], 16 ; SI-NEXT: v_lshr_b64 v[52:53], v[32:33], 16 -; SI-NEXT: v_readfirstlane_b32 s17, v33 -; SI-NEXT: v_readfirstlane_b32 s19, v29 -; SI-NEXT: v_readfirstlane_b32 s21, v21 -; SI-NEXT: v_readfirstlane_b32 s23, v14 -; SI-NEXT: v_readfirstlane_b32 s25, v8 -; SI-NEXT: v_readfirstlane_b32 s27, v6 -; SI-NEXT: v_readfirstlane_b32 s29, v4 +; SI-NEXT: v_readfirstlane_b32 s19, v33 +; SI-NEXT: v_readfirstlane_b32 s17, v29 +; SI-NEXT: v_readfirstlane_b32 s15, v21 +; SI-NEXT: v_readfirstlane_b32 s13, v14 +; SI-NEXT: v_readfirstlane_b32 s11, v8 +; SI-NEXT: v_readfirstlane_b32 s9, v6 +; SI-NEXT: v_readfirstlane_b32 s7, v4 ; SI-NEXT: v_readfirstlane_b32 s5, v2 ; SI-NEXT: v_lshr_b64 v[11:12], v[1:2], 8 ; SI-NEXT: v_lshr_b64 v[18:19], v[5:6], 24 @@ -52717,27 +53125,27 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: s_lshr_b32 s48, s5, 24 ; SI-NEXT: s_lshr_b32 s49, s5, 16 ; SI-NEXT: s_lshr_b32 s50, s5, 8 -; SI-NEXT: s_lshr_b32 s51, s29, 24 -; SI-NEXT: s_lshr_b32 s52, s29, 16 -; SI-NEXT: s_lshr_b32 s53, s29, 8 -; SI-NEXT: s_lshr_b32 s54, s27, 24 -; SI-NEXT: s_lshr_b32 s55, s27, 16 -; SI-NEXT: s_lshr_b32 s64, s27, 8 -; SI-NEXT: s_lshr_b32 s65, s25, 24 -; SI-NEXT: s_lshr_b32 s66, s25, 16 -; SI-NEXT: s_lshr_b32 s67, s25, 8 -; SI-NEXT: s_lshr_b32 s68, s23, 24 -; SI-NEXT: s_lshr_b32 s69, s23, 16 -; SI-NEXT: s_lshr_b32 s70, s23, 8 -; SI-NEXT: s_lshr_b32 s71, s21, 24 -; SI-NEXT: s_lshr_b32 s80, s21, 16 -; SI-NEXT: s_lshr_b32 s81, s21, 8 -; SI-NEXT: s_lshr_b32 s82, s19, 24 -; SI-NEXT: s_lshr_b32 s83, s19, 16 -; SI-NEXT: s_lshr_b32 s84, s19, 8 -; SI-NEXT: s_lshr_b32 s85, s17, 24 -; SI-NEXT: s_lshr_b32 s86, s17, 16 -; SI-NEXT: s_lshr_b32 s87, s17, 8 +; SI-NEXT: s_lshr_b32 s51, s7, 24 +; SI-NEXT: s_lshr_b32 s52, s7, 16 +; SI-NEXT: s_lshr_b32 s53, s7, 8 +; SI-NEXT: s_lshr_b32 s54, s9, 24 +; SI-NEXT: s_lshr_b32 s55, s9, 16 +; SI-NEXT: s_lshr_b32 s64, s9, 8 +; SI-NEXT: s_lshr_b32 s65, s11, 24 +; SI-NEXT: s_lshr_b32 s66, s11, 16 +; SI-NEXT: s_lshr_b32 s67, s11, 8 +; SI-NEXT: s_lshr_b32 s68, s13, 24 +; SI-NEXT: s_lshr_b32 s69, s13, 16 +; SI-NEXT: s_lshr_b32 s70, s13, 8 +; SI-NEXT: s_lshr_b32 s71, s15, 24 +; SI-NEXT: s_lshr_b32 s80, s15, 16 +; SI-NEXT: s_lshr_b32 s81, s15, 8 +; SI-NEXT: s_lshr_b32 s82, s17, 24 +; SI-NEXT: s_lshr_b32 s83, s17, 16 +; SI-NEXT: s_lshr_b32 s84, s17, 8 +; SI-NEXT: s_lshr_b32 s85, s19, 24 +; SI-NEXT: s_lshr_b32 s86, s19, 16 +; SI-NEXT: s_lshr_b32 s87, s19, 8 ; SI-NEXT: s_branch .LBB85_5 ; SI-NEXT: .LBB85_3: ; SI-NEXT: ; implicit-def: $sgpr74 @@ -52753,8 +53161,8 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: ; implicit-def: $sgpr83 ; SI-NEXT: ; implicit-def: $sgpr82 ; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr22 ; SI-NEXT: ; implicit-def: $sgpr81 ; SI-NEXT: ; implicit-def: $sgpr80 ; SI-NEXT: ; implicit-def: $sgpr71 @@ -52785,18 +53193,18 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr20 ; SI-NEXT: s_branch .LBB85_2 ; SI-NEXT: .LBB85_4: -; SI-NEXT: v_mov_b32_e32 v32, s16 -; SI-NEXT: v_mov_b32_e32 v28, s18 -; SI-NEXT: v_mov_b32_e32 v20, s20 -; SI-NEXT: v_mov_b32_e32 v13, s22 -; SI-NEXT: v_mov_b32_e32 v7, s24 -; SI-NEXT: v_mov_b32_e32 v5, s26 -; SI-NEXT: v_mov_b32_e32 v3, s28 +; SI-NEXT: v_mov_b32_e32 v32, s18 +; SI-NEXT: v_mov_b32_e32 v28, s16 +; SI-NEXT: v_mov_b32_e32 v20, s14 +; SI-NEXT: v_mov_b32_e32 v13, s12 +; SI-NEXT: v_mov_b32_e32 v7, s10 +; SI-NEXT: v_mov_b32_e32 v5, s8 +; SI-NEXT: v_mov_b32_e32 v3, s6 ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: v_mov_b32_e32 v53, s74 ; SI-NEXT: v_mov_b32_e32 v52, s62 @@ -52805,8 +53213,8 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v49, s44 ; SI-NEXT: v_mov_b32_e32 v48, s42 ; SI-NEXT: v_mov_b32_e32 v38, s40 -; SI-NEXT: v_mov_b32_e32 v37, s12 -; SI-NEXT: v_mov_b32_e32 v36, s8 +; SI-NEXT: v_mov_b32_e32 v37, s26 +; SI-NEXT: v_mov_b32_e32 v36, s22 ; SI-NEXT: v_mov_b32_e32 v35, s38 ; SI-NEXT: v_mov_b32_e32 v34, s36 ; SI-NEXT: v_mov_b32_e32 v30, s34 @@ -52819,13 +53227,13 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v17, s72 ; SI-NEXT: v_mov_b32_e32 v16, s60 ; SI-NEXT: v_mov_b32_e32 v15, s46 -; SI-NEXT: v_mov_b32_e32 v11, s14 -; SI-NEXT: v_mov_b32_e32 v10, s10 -; SI-NEXT: v_mov_b32_e32 v9, s6 +; SI-NEXT: v_mov_b32_e32 v11, s28 +; SI-NEXT: v_mov_b32_e32 v10, s24 +; SI-NEXT: v_mov_b32_e32 v9, s20 ; SI-NEXT: .LBB85_5: ; %end ; SI-NEXT: v_and_b32_e32 v2, 0xff, v32 ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v53 -; SI-NEXT: s_and_b32 s4, s17, 0xff +; SI-NEXT: s_and_b32 s4, s19, 0xff ; SI-NEXT: s_lshl_b32 s6, s87, 8 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v52 @@ -52834,11 +53242,11 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v51 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s7, s85, 24 +; SI-NEXT: s_lshl_b32 s8, s85, 24 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s6, s8, s6 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen @@ -52849,7 +53257,7 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v28 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v50 -; SI-NEXT: s_and_b32 s4, s19, 0xff +; SI-NEXT: s_and_b32 s4, s17, 0xff ; SI-NEXT: s_lshl_b32 s6, s84, 8 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v49 @@ -52858,11 +53266,11 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v48 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s7, s82, 24 +; SI-NEXT: s_lshl_b32 s8, s82, 24 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s6, s8, s6 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 8, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 @@ -52874,7 +53282,7 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v20 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v38 -; SI-NEXT: s_and_b32 s4, s21, 0xff +; SI-NEXT: s_and_b32 s4, s15, 0xff ; SI-NEXT: s_lshl_b32 s6, s81, 8 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v37 @@ -52883,11 +53291,11 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v36 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s7, s71, 24 +; SI-NEXT: s_lshl_b32 s8, s71, 24 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s6, s8, s6 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 16, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 @@ -52899,7 +53307,7 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v13 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v35 -; SI-NEXT: s_and_b32 s4, s23, 0xff +; SI-NEXT: s_and_b32 s4, s13, 0xff ; SI-NEXT: s_lshl_b32 s6, s70, 8 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v34 @@ -52908,11 +53316,11 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v30 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s7, s68, 24 +; SI-NEXT: s_lshl_b32 s8, s68, 24 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s6, s8, s6 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 24, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 @@ -52924,7 +53332,7 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v7 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v26 -; SI-NEXT: s_and_b32 s4, s25, 0xff +; SI-NEXT: s_and_b32 s4, s11, 0xff ; SI-NEXT: s_lshl_b32 s6, s67, 8 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v25 @@ -52933,11 +53341,11 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v24 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s7, s65, 24 +; SI-NEXT: s_lshl_b32 s8, s65, 24 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s6, s8, s6 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 32, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 @@ -52949,7 +53357,7 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v5 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v23 -; SI-NEXT: s_and_b32 s4, s27, 0xff +; SI-NEXT: s_and_b32 s4, s9, 0xff ; SI-NEXT: s_lshl_b32 s6, s64, 8 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v22 @@ -52958,11 +53366,11 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v18 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s7, s54, 24 +; SI-NEXT: s_lshl_b32 s8, s54, 24 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s6, s8, s6 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 40, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 @@ -52973,7 +53381,7 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v2, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v17 -; SI-NEXT: s_and_b32 s4, s29, 0xff +; SI-NEXT: s_and_b32 s4, s7, 0xff ; SI-NEXT: s_lshl_b32 s6, s53, 8 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v16 @@ -53083,10 +53491,38 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; VI-NEXT: v_writelane_b32 v40, s55, 15 ; VI-NEXT: v_writelane_b32 v40, s64, 16 ; VI-NEXT: v_writelane_b32 v40, s65, 17 +; VI-NEXT: v_mov_b32_e32 v4, s16 +; VI-NEXT: v_mov_b32_e32 v5, s17 +; VI-NEXT: v_mov_b32_e32 v6, s18 +; VI-NEXT: v_mov_b32_e32 v7, s19 +; VI-NEXT: v_mov_b32_e32 v8, s20 +; VI-NEXT: v_mov_b32_e32 v9, s21 +; VI-NEXT: v_mov_b32_e32 v10, s22 +; VI-NEXT: v_mov_b32_e32 v11, s23 +; VI-NEXT: v_mov_b32_e32 v12, s24 +; VI-NEXT: v_mov_b32_e32 v13, s25 +; VI-NEXT: v_mov_b32_e32 v14, s26 +; VI-NEXT: v_mov_b32_e32 v15, s27 +; VI-NEXT: v_mov_b32_e32 v16, s28 +; VI-NEXT: v_mov_b32_e32 v17, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; VI-NEXT: v_writelane_b32 v40, s66, 18 +; VI-NEXT: v_readfirstlane_b32 s18, v4 +; VI-NEXT: v_readfirstlane_b32 s19, v5 +; VI-NEXT: v_readfirstlane_b32 s16, v6 +; VI-NEXT: v_readfirstlane_b32 s17, v7 +; VI-NEXT: v_readfirstlane_b32 s14, v8 +; VI-NEXT: v_readfirstlane_b32 s15, v9 +; VI-NEXT: v_readfirstlane_b32 s12, v10 +; VI-NEXT: v_readfirstlane_b32 s13, v11 +; VI-NEXT: v_readfirstlane_b32 s10, v12 +; VI-NEXT: v_readfirstlane_b32 s11, v13 +; VI-NEXT: v_readfirstlane_b32 s8, v14 +; VI-NEXT: v_readfirstlane_b32 s9, v15 +; VI-NEXT: v_readfirstlane_b32 s6, v16 +; VI-NEXT: v_readfirstlane_b32 s7, v17 ; VI-NEXT: v_readfirstlane_b32 s4, v1 -; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_and_b64 s[20:21], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s5, v2 ; VI-NEXT: v_writelane_b32 v40, s67, 19 ; VI-NEXT: s_cbranch_scc0 .LBB85_3 @@ -53096,59 +53532,59 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; VI-NEXT: s_lshr_b32 s58, s5, 8 ; VI-NEXT: s_lshr_b32 s37, s4, 16 ; VI-NEXT: s_lshr_b32 s36, s4, 8 -; VI-NEXT: s_lshr_b32 s59, s29, 24 -; VI-NEXT: s_lshr_b32 s60, s29, 16 -; VI-NEXT: s_lshr_b32 s61, s29, 8 -; VI-NEXT: s_lshr_b32 s39, s28, 16 -; VI-NEXT: s_lshr_b32 s38, s28, 8 -; VI-NEXT: s_lshr_b32 s62, s27, 24 -; VI-NEXT: s_lshr_b32 s63, s27, 16 -; VI-NEXT: s_lshr_b32 s72, s27, 8 -; VI-NEXT: s_lshr_b32 s49, s26, 16 -; VI-NEXT: s_lshr_b32 s48, s26, 8 -; VI-NEXT: s_lshr_b32 s73, s25, 24 -; VI-NEXT: s_lshr_b32 s74, s25, 16 -; VI-NEXT: s_lshr_b32 s75, s25, 8 -; VI-NEXT: s_lshr_b32 s51, s24, 16 -; VI-NEXT: s_lshr_b32 s50, s24, 8 -; VI-NEXT: s_lshr_b32 s76, s23, 24 -; VI-NEXT: s_lshr_b32 s77, s23, 16 -; VI-NEXT: s_lshr_b32 s78, s23, 8 -; VI-NEXT: s_lshr_b32 s53, s22, 16 -; VI-NEXT: s_lshr_b32 s52, s22, 8 -; VI-NEXT: s_lshr_b32 s79, s21, 24 -; VI-NEXT: s_lshr_b32 s88, s21, 16 -; VI-NEXT: s_lshr_b32 s89, s21, 8 -; VI-NEXT: s_lshr_b32 s55, s20, 16 -; VI-NEXT: s_lshr_b32 s54, s20, 8 -; VI-NEXT: s_lshr_b32 s90, s19, 24 -; VI-NEXT: s_lshr_b32 s91, s19, 16 -; VI-NEXT: s_lshr_b32 s30, s19, 8 -; VI-NEXT: s_lshr_b32 s65, s18, 16 -; VI-NEXT: s_lshr_b32 s64, s18, 8 -; VI-NEXT: s_lshr_b32 s31, s17, 24 -; VI-NEXT: s_lshr_b32 s34, s17, 16 -; VI-NEXT: s_lshr_b32 s35, s17, 8 -; VI-NEXT: s_lshr_b32 s67, s16, 16 -; VI-NEXT: s_lshr_b32 s66, s16, 8 +; VI-NEXT: s_lshr_b32 s59, s7, 24 +; VI-NEXT: s_lshr_b32 s60, s7, 16 +; VI-NEXT: s_lshr_b32 s61, s7, 8 +; VI-NEXT: s_lshr_b32 s39, s6, 16 +; VI-NEXT: s_lshr_b32 s38, s6, 8 +; VI-NEXT: s_lshr_b32 s62, s9, 24 +; VI-NEXT: s_lshr_b32 s63, s9, 16 +; VI-NEXT: s_lshr_b32 s72, s9, 8 +; VI-NEXT: s_lshr_b32 s49, s8, 16 +; VI-NEXT: s_lshr_b32 s48, s8, 8 +; VI-NEXT: s_lshr_b32 s73, s11, 24 +; VI-NEXT: s_lshr_b32 s74, s11, 16 +; VI-NEXT: s_lshr_b32 s75, s11, 8 +; VI-NEXT: s_lshr_b32 s51, s10, 16 +; VI-NEXT: s_lshr_b32 s50, s10, 8 +; VI-NEXT: s_lshr_b32 s76, s13, 24 +; VI-NEXT: s_lshr_b32 s77, s13, 16 +; VI-NEXT: s_lshr_b32 s78, s13, 8 +; VI-NEXT: s_lshr_b32 s53, s12, 16 +; VI-NEXT: s_lshr_b32 s52, s12, 8 +; VI-NEXT: s_lshr_b32 s79, s15, 24 +; VI-NEXT: s_lshr_b32 s88, s15, 16 +; VI-NEXT: s_lshr_b32 s89, s15, 8 +; VI-NEXT: s_lshr_b32 s55, s14, 16 +; VI-NEXT: s_lshr_b32 s54, s14, 8 +; VI-NEXT: s_lshr_b32 s90, s17, 24 +; VI-NEXT: s_lshr_b32 s91, s17, 16 +; VI-NEXT: s_lshr_b32 s30, s17, 8 +; VI-NEXT: s_lshr_b32 s65, s16, 16 +; VI-NEXT: s_lshr_b32 s64, s16, 8 +; VI-NEXT: s_lshr_b32 s31, s19, 24 +; VI-NEXT: s_lshr_b32 s34, s19, 16 +; VI-NEXT: s_lshr_b32 s35, s19, 8 +; VI-NEXT: s_lshr_b32 s67, s18, 16 +; VI-NEXT: s_lshr_b32 s66, s18, 8 ; VI-NEXT: s_lshr_b64 s[44:45], s[4:5], 24 -; VI-NEXT: s_lshr_b64 s[42:43], s[28:29], 24 -; VI-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 -; VI-NEXT: s_lshr_b64 s[14:15], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[12:13], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[42:43], s[6:7], 24 +; VI-NEXT: s_lshr_b64 s[40:41], s[8:9], 24 +; VI-NEXT: s_lshr_b64 s[28:29], s[10:11], 24 +; VI-NEXT: s_lshr_b64 s[26:27], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[24:25], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[22:23], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[20:21], s[18:19], 24 ; VI-NEXT: s_cbranch_execnz .LBB85_4 ; VI-NEXT: .LBB85_2: ; %cmp.true ; VI-NEXT: v_add_f64 v[1:2], s[4:5], 1.0 -; VI-NEXT: v_add_f64 v[3:4], s[28:29], 1.0 -; VI-NEXT: v_add_f64 v[5:6], s[26:27], 1.0 -; VI-NEXT: v_add_f64 v[7:8], s[24:25], 1.0 -; VI-NEXT: v_add_f64 v[11:12], s[22:23], 1.0 -; VI-NEXT: v_add_f64 v[15:16], s[20:21], 1.0 -; VI-NEXT: v_add_f64 v[9:10], s[18:19], 1.0 -; VI-NEXT: v_add_f64 v[13:14], s[16:17], 1.0 +; VI-NEXT: v_add_f64 v[3:4], s[6:7], 1.0 +; VI-NEXT: v_add_f64 v[5:6], s[8:9], 1.0 +; VI-NEXT: v_add_f64 v[7:8], s[10:11], 1.0 +; VI-NEXT: v_add_f64 v[11:12], s[12:13], 1.0 +; VI-NEXT: v_add_f64 v[15:16], s[14:15], 1.0 +; VI-NEXT: v_add_f64 v[9:10], s[16:17], 1.0 +; VI-NEXT: v_add_f64 v[13:14], s[18:19], 1.0 ; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[1:2] ; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[3:4] ; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[5:6] @@ -53156,13 +53592,13 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[11:12] ; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[15:16] ; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[9:10] -; VI-NEXT: v_readfirstlane_b32 s17, v14 -; VI-NEXT: v_readfirstlane_b32 s19, v10 -; VI-NEXT: v_readfirstlane_b32 s21, v16 -; VI-NEXT: v_readfirstlane_b32 s23, v12 -; VI-NEXT: v_readfirstlane_b32 s25, v8 -; VI-NEXT: v_readfirstlane_b32 s27, v6 -; VI-NEXT: v_readfirstlane_b32 s29, v4 +; VI-NEXT: v_readfirstlane_b32 s19, v14 +; VI-NEXT: v_readfirstlane_b32 s17, v10 +; VI-NEXT: v_readfirstlane_b32 s15, v16 +; VI-NEXT: v_readfirstlane_b32 s13, v12 +; VI-NEXT: v_readfirstlane_b32 s11, v8 +; VI-NEXT: v_readfirstlane_b32 s9, v6 +; VI-NEXT: v_readfirstlane_b32 s7, v4 ; VI-NEXT: v_readfirstlane_b32 s5, v2 ; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[13:14] ; VI-NEXT: s_lshr_b32 s56, s5, 24 @@ -53170,70 +53606,70 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; VI-NEXT: s_lshr_b32 s58, s5, 8 ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v1 -; VI-NEXT: s_lshr_b32 s59, s29, 24 -; VI-NEXT: s_lshr_b32 s60, s29, 16 -; VI-NEXT: s_lshr_b32 s61, s29, 8 +; VI-NEXT: s_lshr_b32 s59, s7, 24 +; VI-NEXT: s_lshr_b32 s60, s7, 16 +; VI-NEXT: s_lshr_b32 s61, s7, 8 ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 ; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v3 -; VI-NEXT: s_lshr_b32 s62, s27, 24 -; VI-NEXT: s_lshr_b32 s63, s27, 16 -; VI-NEXT: s_lshr_b32 s72, s27, 8 +; VI-NEXT: s_lshr_b32 s62, s9, 24 +; VI-NEXT: s_lshr_b32 s63, s9, 16 +; VI-NEXT: s_lshr_b32 s72, s9, 8 ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 -; VI-NEXT: s_lshr_b32 s73, s25, 24 -; VI-NEXT: s_lshr_b32 s74, s25, 16 -; VI-NEXT: s_lshr_b32 s75, s25, 8 +; VI-NEXT: s_lshr_b32 s73, s11, 24 +; VI-NEXT: s_lshr_b32 s74, s11, 16 +; VI-NEXT: s_lshr_b32 s75, s11, 8 ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v7 -; VI-NEXT: s_lshr_b32 s76, s23, 24 -; VI-NEXT: s_lshr_b32 s77, s23, 16 -; VI-NEXT: s_lshr_b32 s78, s23, 8 +; VI-NEXT: s_lshr_b32 s76, s13, 24 +; VI-NEXT: s_lshr_b32 s77, s13, 16 +; VI-NEXT: s_lshr_b32 s78, s13, 8 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 ; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v11 -; VI-NEXT: s_lshr_b32 s79, s21, 24 -; VI-NEXT: s_lshr_b32 s88, s21, 16 -; VI-NEXT: s_lshr_b32 s89, s21, 8 +; VI-NEXT: s_lshr_b32 s79, s15, 24 +; VI-NEXT: s_lshr_b32 s88, s15, 16 +; VI-NEXT: s_lshr_b32 s89, s15, 8 ; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v15 ; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v15 -; VI-NEXT: s_lshr_b32 s90, s19, 24 -; VI-NEXT: s_lshr_b32 s91, s19, 16 -; VI-NEXT: s_lshr_b32 s30, s19, 8 +; VI-NEXT: s_lshr_b32 s90, s17, 24 +; VI-NEXT: s_lshr_b32 s91, s17, 16 +; VI-NEXT: s_lshr_b32 s30, s17, 8 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v9 ; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v9 -; VI-NEXT: s_lshr_b32 s31, s17, 24 -; VI-NEXT: s_lshr_b32 s34, s17, 16 -; VI-NEXT: s_lshr_b32 s35, s17, 8 +; VI-NEXT: s_lshr_b32 s31, s19, 24 +; VI-NEXT: s_lshr_b32 s34, s19, 16 +; VI-NEXT: s_lshr_b32 s35, s19, 8 ; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 ; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v13 ; VI-NEXT: s_branch .LBB85_5 ; VI-NEXT: .LBB85_3: ; VI-NEXT: ; implicit-def: $sgpr66 ; VI-NEXT: ; implicit-def: $sgpr67 -; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr20 ; VI-NEXT: ; implicit-def: $sgpr35 ; VI-NEXT: ; implicit-def: $sgpr34 ; VI-NEXT: ; implicit-def: $sgpr31 ; VI-NEXT: ; implicit-def: $sgpr64 ; VI-NEXT: ; implicit-def: $sgpr65 -; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr22 ; VI-NEXT: ; implicit-def: $sgpr30 ; VI-NEXT: ; implicit-def: $sgpr91 ; VI-NEXT: ; implicit-def: $sgpr90 ; VI-NEXT: ; implicit-def: $sgpr54 ; VI-NEXT: ; implicit-def: $sgpr55 -; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr24 ; VI-NEXT: ; implicit-def: $sgpr89 ; VI-NEXT: ; implicit-def: $sgpr88 ; VI-NEXT: ; implicit-def: $sgpr79 ; VI-NEXT: ; implicit-def: $sgpr52 ; VI-NEXT: ; implicit-def: $sgpr53 -; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; implicit-def: $sgpr78 ; VI-NEXT: ; implicit-def: $sgpr77 ; VI-NEXT: ; implicit-def: $sgpr76 ; VI-NEXT: ; implicit-def: $sgpr50 ; VI-NEXT: ; implicit-def: $sgpr51 -; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr28 ; VI-NEXT: ; implicit-def: $sgpr75 ; VI-NEXT: ; implicit-def: $sgpr74 ; VI-NEXT: ; implicit-def: $sgpr73 @@ -53257,8 +53693,8 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; VI-NEXT: ; implicit-def: $sgpr56 ; VI-NEXT: s_branch .LBB85_2 ; VI-NEXT: .LBB85_4: -; VI-NEXT: v_mov_b32_e32 v13, s16 -; VI-NEXT: v_mov_b32_e32 v9, s18 +; VI-NEXT: v_mov_b32_e32 v13, s18 +; VI-NEXT: v_mov_b32_e32 v9, s16 ; VI-NEXT: v_mov_b32_e32 v48, s67 ; VI-NEXT: v_mov_b32_e32 v49, s66 ; VI-NEXT: v_mov_b32_e32 v38, s65 @@ -53275,27 +53711,27 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; VI-NEXT: v_mov_b32_e32 v29, s38 ; VI-NEXT: v_mov_b32_e32 v26, s37 ; VI-NEXT: v_mov_b32_e32 v27, s36 -; VI-NEXT: v_mov_b32_e32 v15, s20 -; VI-NEXT: v_mov_b32_e32 v11, s22 -; VI-NEXT: v_mov_b32_e32 v7, s24 -; VI-NEXT: v_mov_b32_e32 v5, s26 -; VI-NEXT: v_mov_b32_e32 v3, s28 +; VI-NEXT: v_mov_b32_e32 v15, s14 +; VI-NEXT: v_mov_b32_e32 v11, s12 +; VI-NEXT: v_mov_b32_e32 v7, s10 +; VI-NEXT: v_mov_b32_e32 v5, s8 +; VI-NEXT: v_mov_b32_e32 v3, s6 ; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_mov_b32_e32 v24, s6 -; VI-NEXT: v_mov_b32_e32 v23, s8 -; VI-NEXT: v_mov_b32_e32 v22, s10 -; VI-NEXT: v_mov_b32_e32 v21, s12 -; VI-NEXT: v_mov_b32_e32 v20, s14 +; VI-NEXT: v_mov_b32_e32 v24, s20 +; VI-NEXT: v_mov_b32_e32 v23, s22 +; VI-NEXT: v_mov_b32_e32 v22, s24 +; VI-NEXT: v_mov_b32_e32 v21, s26 +; VI-NEXT: v_mov_b32_e32 v20, s28 ; VI-NEXT: v_mov_b32_e32 v19, s40 ; VI-NEXT: v_mov_b32_e32 v18, s42 ; VI-NEXT: v_mov_b32_e32 v17, s44 ; VI-NEXT: .LBB85_5: ; %end -; VI-NEXT: s_and_b32 s4, s17, 0xff +; VI-NEXT: s_and_b32 s4, s19, 0xff ; VI-NEXT: s_lshl_b32 s6, s35, 8 ; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: s_and_b32 s6, s34, 0xff -; VI-NEXT: s_lshl_b32 s7, s31, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_lshl_b32 s8, s31, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v49 ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v24 ; VI-NEXT: s_and_b32 s4, s4, 0xffff @@ -53305,17 +53741,17 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: s_and_b32 s4, s19, 0xff +; VI-NEXT: s_and_b32 s4, s17, 0xff ; VI-NEXT: s_lshl_b32 s6, s30, 8 ; VI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: s_and_b32 s6, s91, 0xff -; VI-NEXT: s_lshl_b32 s7, s90, 8 +; VI-NEXT: s_lshl_b32 s8, s90, 8 ; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v39 ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v23 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s6, s6, s8 ; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v38, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff @@ -53325,16 +53761,16 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: s_and_b32 s4, s21, 0xff +; VI-NEXT: s_and_b32 s4, s15, 0xff ; VI-NEXT: s_lshl_b32 s6, s89, 8 ; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 ; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: s_and_b32 s6, s88, 0xff -; VI-NEXT: s_lshl_b32 s7, s79, 8 +; VI-NEXT: s_lshl_b32 s8, s79, 8 ; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v37 ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v22 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s6, s6, s8 ; VI-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v36, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff @@ -53344,16 +53780,16 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: s_and_b32 s4, s23, 0xff +; VI-NEXT: s_and_b32 s4, s13, 0xff ; VI-NEXT: s_lshl_b32 s6, s78, 8 ; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 ; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: s_and_b32 s6, s77, 0xff -; VI-NEXT: s_lshl_b32 s7, s76, 8 +; VI-NEXT: s_lshl_b32 s8, s76, 8 ; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v35 ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v21 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s6, s6, s8 ; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v34, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff @@ -53363,16 +53799,16 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: s_and_b32 s4, s25, 0xff +; VI-NEXT: s_and_b32 s4, s11, 0xff ; VI-NEXT: s_lshl_b32 s6, s75, 8 ; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 ; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: s_and_b32 s6, s74, 0xff -; VI-NEXT: s_lshl_b32 s7, s73, 8 +; VI-NEXT: s_lshl_b32 s8, s73, 8 ; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v33 ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v20 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s6, s6, s8 ; VI-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v32, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff @@ -53382,16 +53818,16 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: s_and_b32 s4, s27, 0xff +; VI-NEXT: s_and_b32 s4, s9, 0xff ; VI-NEXT: s_lshl_b32 s6, s72, 8 ; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 ; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: s_and_b32 s6, s63, 0xff -; VI-NEXT: s_lshl_b32 s7, s62, 8 +; VI-NEXT: s_lshl_b32 s8, s62, 8 ; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v31 ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v19 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s6, s6, s8 ; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v30, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff @@ -53402,7 +53838,7 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; VI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: s_and_b32 s4, s29, 0xff +; VI-NEXT: s_and_b32 s4, s7, 0xff ; VI-NEXT: s_lshl_b32 s6, s61, 8 ; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v29 @@ -53487,10 +53923,38 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; GFX9-NEXT: v_writelane_b32 v40, s51, 11 ; GFX9-NEXT: v_writelane_b32 v40, s52, 12 ; GFX9-NEXT: v_writelane_b32 v40, s53, 13 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: v_mov_b32_e32 v5, s17 +; GFX9-NEXT: v_mov_b32_e32 v6, s18 +; GFX9-NEXT: v_mov_b32_e32 v7, s19 +; GFX9-NEXT: v_mov_b32_e32 v8, s20 +; GFX9-NEXT: v_mov_b32_e32 v9, s21 +; GFX9-NEXT: v_mov_b32_e32 v10, s22 +; GFX9-NEXT: v_mov_b32_e32 v11, s23 +; GFX9-NEXT: v_mov_b32_e32 v12, s24 +; GFX9-NEXT: v_mov_b32_e32 v13, s25 +; GFX9-NEXT: v_mov_b32_e32 v14, s26 +; GFX9-NEXT: v_mov_b32_e32 v15, s27 +; GFX9-NEXT: v_mov_b32_e32 v16, s28 +; GFX9-NEXT: v_mov_b32_e32 v17, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; GFX9-NEXT: v_writelane_b32 v40, s54, 14 +; GFX9-NEXT: v_readfirstlane_b32 s18, v4 +; GFX9-NEXT: v_readfirstlane_b32 s19, v5 +; GFX9-NEXT: v_readfirstlane_b32 s16, v6 +; GFX9-NEXT: v_readfirstlane_b32 s17, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s12, v10 +; GFX9-NEXT: v_readfirstlane_b32 s13, v11 +; GFX9-NEXT: v_readfirstlane_b32 s10, v12 +; GFX9-NEXT: v_readfirstlane_b32 s11, v13 +; GFX9-NEXT: v_readfirstlane_b32 s8, v14 +; GFX9-NEXT: v_readfirstlane_b32 s9, v15 +; GFX9-NEXT: v_readfirstlane_b32 s6, v16 +; GFX9-NEXT: v_readfirstlane_b32 s7, v17 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec ; GFX9-NEXT: v_readfirstlane_b32 s5, v2 ; GFX9-NEXT: v_writelane_b32 v40, s55, 15 ; GFX9-NEXT: s_cbranch_scc0 .LBB85_3 @@ -53500,59 +53964,59 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; GFX9-NEXT: s_lshr_b32 s58, s5, 8 ; GFX9-NEXT: s_lshr_b32 s31, s4, 16 ; GFX9-NEXT: s_lshr_b32 s30, s4, 8 -; GFX9-NEXT: s_lshr_b32 s59, s29, 24 -; GFX9-NEXT: s_lshr_b32 s60, s29, 16 -; GFX9-NEXT: s_lshr_b32 s61, s29, 8 -; GFX9-NEXT: s_lshr_b32 s35, s28, 16 -; GFX9-NEXT: s_lshr_b32 s34, s28, 8 -; GFX9-NEXT: s_lshr_b32 s62, s27, 24 -; GFX9-NEXT: s_lshr_b32 s63, s27, 16 -; GFX9-NEXT: s_lshr_b32 s72, s27, 8 -; GFX9-NEXT: s_lshr_b32 s37, s26, 16 -; GFX9-NEXT: s_lshr_b32 s36, s26, 8 -; GFX9-NEXT: s_lshr_b32 s73, s25, 24 -; GFX9-NEXT: s_lshr_b32 s74, s25, 16 -; GFX9-NEXT: s_lshr_b32 s75, s25, 8 -; GFX9-NEXT: s_lshr_b32 s39, s24, 16 -; GFX9-NEXT: s_lshr_b32 s38, s24, 8 -; GFX9-NEXT: s_lshr_b32 s76, s23, 24 -; GFX9-NEXT: s_lshr_b32 s77, s23, 16 -; GFX9-NEXT: s_lshr_b32 s78, s23, 8 -; GFX9-NEXT: s_lshr_b32 s49, s22, 16 -; GFX9-NEXT: s_lshr_b32 s48, s22, 8 -; GFX9-NEXT: s_lshr_b32 s79, s21, 24 -; GFX9-NEXT: s_lshr_b32 s88, s21, 16 -; GFX9-NEXT: s_lshr_b32 s89, s21, 8 -; GFX9-NEXT: s_lshr_b32 s51, s20, 16 -; GFX9-NEXT: s_lshr_b32 s50, s20, 8 -; GFX9-NEXT: s_lshr_b32 s90, s19, 24 -; GFX9-NEXT: s_lshr_b32 s91, s19, 16 -; GFX9-NEXT: s_lshr_b32 s92, s19, 8 -; GFX9-NEXT: s_lshr_b32 s53, s18, 16 -; GFX9-NEXT: s_lshr_b32 s52, s18, 8 -; GFX9-NEXT: s_lshr_b32 s93, s17, 24 -; GFX9-NEXT: s_lshr_b32 s94, s17, 16 -; GFX9-NEXT: s_lshr_b32 s95, s17, 8 -; GFX9-NEXT: s_lshr_b32 s55, s16, 16 -; GFX9-NEXT: s_lshr_b32 s54, s16, 8 +; GFX9-NEXT: s_lshr_b32 s59, s7, 24 +; GFX9-NEXT: s_lshr_b32 s60, s7, 16 +; GFX9-NEXT: s_lshr_b32 s61, s7, 8 +; GFX9-NEXT: s_lshr_b32 s35, s6, 16 +; GFX9-NEXT: s_lshr_b32 s34, s6, 8 +; GFX9-NEXT: s_lshr_b32 s62, s9, 24 +; GFX9-NEXT: s_lshr_b32 s63, s9, 16 +; GFX9-NEXT: s_lshr_b32 s72, s9, 8 +; GFX9-NEXT: s_lshr_b32 s37, s8, 16 +; GFX9-NEXT: s_lshr_b32 s36, s8, 8 +; GFX9-NEXT: s_lshr_b32 s73, s11, 24 +; GFX9-NEXT: s_lshr_b32 s74, s11, 16 +; GFX9-NEXT: s_lshr_b32 s75, s11, 8 +; GFX9-NEXT: s_lshr_b32 s39, s10, 16 +; GFX9-NEXT: s_lshr_b32 s38, s10, 8 +; GFX9-NEXT: s_lshr_b32 s76, s13, 24 +; GFX9-NEXT: s_lshr_b32 s77, s13, 16 +; GFX9-NEXT: s_lshr_b32 s78, s13, 8 +; GFX9-NEXT: s_lshr_b32 s49, s12, 16 +; GFX9-NEXT: s_lshr_b32 s48, s12, 8 +; GFX9-NEXT: s_lshr_b32 s79, s15, 24 +; GFX9-NEXT: s_lshr_b32 s88, s15, 16 +; GFX9-NEXT: s_lshr_b32 s89, s15, 8 +; GFX9-NEXT: s_lshr_b32 s51, s14, 16 +; GFX9-NEXT: s_lshr_b32 s50, s14, 8 +; GFX9-NEXT: s_lshr_b32 s90, s17, 24 +; GFX9-NEXT: s_lshr_b32 s91, s17, 16 +; GFX9-NEXT: s_lshr_b32 s92, s17, 8 +; GFX9-NEXT: s_lshr_b32 s53, s16, 16 +; GFX9-NEXT: s_lshr_b32 s52, s16, 8 +; GFX9-NEXT: s_lshr_b32 s93, s19, 24 +; GFX9-NEXT: s_lshr_b32 s94, s19, 16 +; GFX9-NEXT: s_lshr_b32 s95, s19, 8 +; GFX9-NEXT: s_lshr_b32 s55, s18, 16 +; GFX9-NEXT: s_lshr_b32 s54, s18, 8 ; GFX9-NEXT: s_lshr_b64 s[44:45], s[4:5], 24 -; GFX9-NEXT: s_lshr_b64 s[42:43], s[28:29], 24 -; GFX9-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 -; GFX9-NEXT: s_lshr_b64 s[14:15], s[24:25], 24 -; GFX9-NEXT: s_lshr_b64 s[12:13], s[22:23], 24 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 -; GFX9-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 -; GFX9-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[42:43], s[6:7], 24 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[8:9], 24 +; GFX9-NEXT: s_lshr_b64 s[28:29], s[10:11], 24 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[12:13], 24 +; GFX9-NEXT: s_lshr_b64 s[24:25], s[14:15], 24 +; GFX9-NEXT: s_lshr_b64 s[22:23], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[20:21], s[18:19], 24 ; GFX9-NEXT: s_cbranch_execnz .LBB85_4 ; GFX9-NEXT: .LBB85_2: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[1:2], s[4:5], 1.0 -; GFX9-NEXT: v_add_f64 v[3:4], s[28:29], 1.0 -; GFX9-NEXT: v_add_f64 v[5:6], s[26:27], 1.0 -; GFX9-NEXT: v_add_f64 v[7:8], s[24:25], 1.0 -; GFX9-NEXT: v_add_f64 v[9:10], s[22:23], 1.0 -; GFX9-NEXT: v_add_f64 v[15:16], s[20:21], 1.0 -; GFX9-NEXT: v_add_f64 v[11:12], s[18:19], 1.0 -; GFX9-NEXT: v_add_f64 v[13:14], s[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[3:4], s[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[5:6], s[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[7:8], s[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[9:10], s[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[15:16], s[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[11:12], s[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[13:14], s[18:19], 1.0 ; GFX9-NEXT: v_lshrrev_b64 v[17:18], 24, v[1:2] ; GFX9-NEXT: v_lshrrev_b64 v[18:19], 24, v[3:4] ; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[5:6] @@ -53560,13 +54024,13 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[9:10] ; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[15:16] ; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[11:12] -; GFX9-NEXT: v_readfirstlane_b32 s17, v14 -; GFX9-NEXT: v_readfirstlane_b32 s19, v12 -; GFX9-NEXT: v_readfirstlane_b32 s21, v16 -; GFX9-NEXT: v_readfirstlane_b32 s23, v10 -; GFX9-NEXT: v_readfirstlane_b32 s25, v8 -; GFX9-NEXT: v_readfirstlane_b32 s27, v6 -; GFX9-NEXT: v_readfirstlane_b32 s29, v4 +; GFX9-NEXT: v_readfirstlane_b32 s19, v14 +; GFX9-NEXT: v_readfirstlane_b32 s17, v12 +; GFX9-NEXT: v_readfirstlane_b32 s15, v16 +; GFX9-NEXT: v_readfirstlane_b32 s13, v10 +; GFX9-NEXT: v_readfirstlane_b32 s11, v8 +; GFX9-NEXT: v_readfirstlane_b32 s9, v6 +; GFX9-NEXT: v_readfirstlane_b32 s7, v4 ; GFX9-NEXT: v_readfirstlane_b32 s5, v2 ; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[13:14] ; GFX9-NEXT: s_lshr_b32 s56, s5, 24 @@ -53574,70 +54038,70 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; GFX9-NEXT: s_lshr_b32 s58, s5, 8 ; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 8, v1 -; GFX9-NEXT: s_lshr_b32 s59, s29, 24 -; GFX9-NEXT: s_lshr_b32 s60, s29, 16 -; GFX9-NEXT: s_lshr_b32 s61, s29, 8 +; GFX9-NEXT: s_lshr_b32 s59, s7, 24 +; GFX9-NEXT: s_lshr_b32 s60, s7, 16 +; GFX9-NEXT: s_lshr_b32 s61, s7, 8 ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v3 -; GFX9-NEXT: s_lshr_b32 s62, s27, 24 -; GFX9-NEXT: s_lshr_b32 s63, s27, 16 -; GFX9-NEXT: s_lshr_b32 s72, s27, 8 +; GFX9-NEXT: s_lshr_b32 s62, s9, 24 +; GFX9-NEXT: s_lshr_b32 s63, s9, 16 +; GFX9-NEXT: s_lshr_b32 s72, s9, 8 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v5 -; GFX9-NEXT: s_lshr_b32 s73, s25, 24 -; GFX9-NEXT: s_lshr_b32 s74, s25, 16 -; GFX9-NEXT: s_lshr_b32 s75, s25, 8 +; GFX9-NEXT: s_lshr_b32 s73, s11, 24 +; GFX9-NEXT: s_lshr_b32 s74, s11, 16 +; GFX9-NEXT: s_lshr_b32 s75, s11, 8 ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v7 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v7 -; GFX9-NEXT: s_lshr_b32 s76, s23, 24 -; GFX9-NEXT: s_lshr_b32 s77, s23, 16 -; GFX9-NEXT: s_lshr_b32 s78, s23, 8 +; GFX9-NEXT: s_lshr_b32 s76, s13, 24 +; GFX9-NEXT: s_lshr_b32 s77, s13, 16 +; GFX9-NEXT: s_lshr_b32 s78, s13, 8 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v9 -; GFX9-NEXT: s_lshr_b32 s79, s21, 24 -; GFX9-NEXT: s_lshr_b32 s88, s21, 16 -; GFX9-NEXT: s_lshr_b32 s89, s21, 8 +; GFX9-NEXT: s_lshr_b32 s79, s15, 24 +; GFX9-NEXT: s_lshr_b32 s88, s15, 16 +; GFX9-NEXT: s_lshr_b32 s89, s15, 8 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v15 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v15 -; GFX9-NEXT: s_lshr_b32 s90, s19, 24 -; GFX9-NEXT: s_lshr_b32 s91, s19, 16 -; GFX9-NEXT: s_lshr_b32 s92, s19, 8 +; GFX9-NEXT: s_lshr_b32 s90, s17, 24 +; GFX9-NEXT: s_lshr_b32 s91, s17, 16 +; GFX9-NEXT: s_lshr_b32 s92, s17, 8 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v11 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v11 -; GFX9-NEXT: s_lshr_b32 s93, s17, 24 -; GFX9-NEXT: s_lshr_b32 s94, s17, 16 -; GFX9-NEXT: s_lshr_b32 s95, s17, 8 +; GFX9-NEXT: s_lshr_b32 s93, s19, 24 +; GFX9-NEXT: s_lshr_b32 s94, s19, 16 +; GFX9-NEXT: s_lshr_b32 s95, s19, 8 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v13 ; GFX9-NEXT: s_branch .LBB85_5 ; GFX9-NEXT: .LBB85_3: ; GFX9-NEXT: ; implicit-def: $sgpr54 ; GFX9-NEXT: ; implicit-def: $sgpr55 -; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr20 ; GFX9-NEXT: ; implicit-def: $sgpr95 ; GFX9-NEXT: ; implicit-def: $sgpr94 ; GFX9-NEXT: ; implicit-def: $sgpr93 ; GFX9-NEXT: ; implicit-def: $sgpr52 ; GFX9-NEXT: ; implicit-def: $sgpr53 -; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr22 ; GFX9-NEXT: ; implicit-def: $sgpr92 ; GFX9-NEXT: ; implicit-def: $sgpr91 ; GFX9-NEXT: ; implicit-def: $sgpr90 ; GFX9-NEXT: ; implicit-def: $sgpr50 ; GFX9-NEXT: ; implicit-def: $sgpr51 -; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr24 ; GFX9-NEXT: ; implicit-def: $sgpr89 ; GFX9-NEXT: ; implicit-def: $sgpr88 ; GFX9-NEXT: ; implicit-def: $sgpr79 ; GFX9-NEXT: ; implicit-def: $sgpr48 ; GFX9-NEXT: ; implicit-def: $sgpr49 -; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; implicit-def: $sgpr78 ; GFX9-NEXT: ; implicit-def: $sgpr77 ; GFX9-NEXT: ; implicit-def: $sgpr76 ; GFX9-NEXT: ; implicit-def: $sgpr38 ; GFX9-NEXT: ; implicit-def: $sgpr39 -; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr28 ; GFX9-NEXT: ; implicit-def: $sgpr75 ; GFX9-NEXT: ; implicit-def: $sgpr74 ; GFX9-NEXT: ; implicit-def: $sgpr73 @@ -53661,8 +54125,8 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; GFX9-NEXT: ; implicit-def: $sgpr56 ; GFX9-NEXT: s_branch .LBB85_2 ; GFX9-NEXT: .LBB85_4: -; GFX9-NEXT: v_mov_b32_e32 v13, s16 -; GFX9-NEXT: v_mov_b32_e32 v11, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s18 +; GFX9-NEXT: v_mov_b32_e32 v11, s16 ; GFX9-NEXT: v_mov_b32_e32 v39, s55 ; GFX9-NEXT: v_mov_b32_e32 v49, s54 ; GFX9-NEXT: v_mov_b32_e32 v37, s53 @@ -53679,29 +54143,29 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; GFX9-NEXT: v_mov_b32_e32 v29, s34 ; GFX9-NEXT: v_mov_b32_e32 v26, s31 ; GFX9-NEXT: v_mov_b32_e32 v27, s30 -; GFX9-NEXT: v_mov_b32_e32 v15, s20 -; GFX9-NEXT: v_mov_b32_e32 v9, s22 -; GFX9-NEXT: v_mov_b32_e32 v7, s24 -; GFX9-NEXT: v_mov_b32_e32 v5, s26 -; GFX9-NEXT: v_mov_b32_e32 v3, s28 +; GFX9-NEXT: v_mov_b32_e32 v15, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v24, s6 -; GFX9-NEXT: v_mov_b32_e32 v23, s8 -; GFX9-NEXT: v_mov_b32_e32 v22, s10 -; GFX9-NEXT: v_mov_b32_e32 v21, s12 -; GFX9-NEXT: v_mov_b32_e32 v20, s14 +; GFX9-NEXT: v_mov_b32_e32 v24, s20 +; GFX9-NEXT: v_mov_b32_e32 v23, s22 +; GFX9-NEXT: v_mov_b32_e32 v22, s24 +; GFX9-NEXT: v_mov_b32_e32 v21, s26 +; GFX9-NEXT: v_mov_b32_e32 v20, s28 ; GFX9-NEXT: v_mov_b32_e32 v19, s40 ; GFX9-NEXT: v_mov_b32_e32 v18, s42 ; GFX9-NEXT: v_mov_b32_e32 v17, s44 ; GFX9-NEXT: .LBB85_5: ; %end -; GFX9-NEXT: s_and_b32 s4, s17, 0xff +; GFX9-NEXT: s_and_b32 s4, s19, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s95, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: s_and_b32 s6, s94, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s93, 8 +; GFX9-NEXT: s_lshl_b32 s8, s93, 8 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v49 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v24 -; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_or_b32 s6, s6, s8 ; GFX9-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v39, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff @@ -53710,15 +54174,15 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: s_and_b32 s4, s19, 0xff +; GFX9-NEXT: s_and_b32 s4, s17, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s92, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: s_and_b32 s6, s91, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s90, 8 +; GFX9-NEXT: s_lshl_b32 s8, s90, 8 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v48 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v23 -; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_or_b32 s6, s6, s8 ; GFX9-NEXT: v_or_b32_sdwa v4, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff @@ -53727,15 +54191,15 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:8 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: s_and_b32 s4, s21, 0xff +; GFX9-NEXT: s_and_b32 s4, s15, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s89, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: s_and_b32 s6, s88, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s79, 8 +; GFX9-NEXT: s_lshl_b32 s8, s79, 8 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v38 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:12 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v22 -; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_or_b32 s6, s6, s8 ; GFX9-NEXT: v_or_b32_sdwa v6, v15, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff @@ -53744,15 +54208,15 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:16 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: s_and_b32 s4, s23, 0xff +; GFX9-NEXT: s_and_b32 s4, s13, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s78, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: s_and_b32 s6, s77, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s76, 8 +; GFX9-NEXT: s_lshl_b32 s8, s76, 8 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:20 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v35 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v21 -; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_or_b32 s6, s6, s8 ; GFX9-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v4, v34, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff @@ -53761,15 +54225,15 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:24 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: s_and_b32 s4, s25, 0xff +; GFX9-NEXT: s_and_b32 s4, s11, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s75, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: s_and_b32 s6, s74, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s73, 8 +; GFX9-NEXT: s_lshl_b32 s8, s73, 8 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:28 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v33 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v20 -; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_or_b32 s6, s6, s8 ; GFX9-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v4, v32, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff @@ -53778,15 +54242,15 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:32 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: s_and_b32 s4, s27, 0xff +; GFX9-NEXT: s_and_b32 s4, s9, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s72, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: s_and_b32 s6, s63, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s62, 8 +; GFX9-NEXT: s_lshl_b32 s8, s62, 8 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:36 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v31 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v19 -; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_or_b32 s6, s6, s8 ; GFX9-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v4, v30, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff @@ -53795,7 +54259,7 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:40 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: s_and_b32 s4, s29, 0xff +; GFX9-NEXT: s_and_b32 s4, s7, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s61, 8 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v29 @@ -59063,111 +59527,139 @@ define inreg <32 x half> @bitcast_v32i16_to_v32f16_scalar(<32 x i16> inreg %a, i ; VI-LABEL: bitcast_v32i16_to_v32f16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, s16 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v5, s18 +; VI-NEXT: v_mov_b32_e32 v6, s19 +; VI-NEXT: v_mov_b32_e32 v7, s20 +; VI-NEXT: v_mov_b32_e32 v8, s21 +; VI-NEXT: v_mov_b32_e32 v9, s22 +; VI-NEXT: v_mov_b32_e32 v10, s23 +; VI-NEXT: v_mov_b32_e32 v11, s24 +; VI-NEXT: v_mov_b32_e32 v12, s25 +; VI-NEXT: v_mov_b32_e32 v13, s26 +; VI-NEXT: v_mov_b32_e32 v14, s27 +; VI-NEXT: v_mov_b32_e32 v15, s28 +; VI-NEXT: v_mov_b32_e32 v16, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_readfirstlane_b32 s21, v3 +; VI-NEXT: v_readfirstlane_b32 s20, v4 +; VI-NEXT: v_readfirstlane_b32 s19, v5 +; VI-NEXT: v_readfirstlane_b32 s18, v6 +; VI-NEXT: v_readfirstlane_b32 s17, v7 +; VI-NEXT: v_readfirstlane_b32 s16, v8 +; VI-NEXT: v_readfirstlane_b32 s15, v9 +; VI-NEXT: v_readfirstlane_b32 s14, v10 +; VI-NEXT: v_readfirstlane_b32 s13, v11 +; VI-NEXT: v_readfirstlane_b32 s12, v12 +; VI-NEXT: v_readfirstlane_b32 s11, v13 +; VI-NEXT: v_readfirstlane_b32 s10, v14 +; VI-NEXT: v_readfirstlane_b32 s8, v15 +; VI-NEXT: v_readfirstlane_b32 s7, v16 ; VI-NEXT: v_readfirstlane_b32 s6, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_readfirstlane_b32 s7, v1 +; VI-NEXT: v_readfirstlane_b32 s9, v1 ; VI-NEXT: s_cbranch_scc0 .LBB89_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB89_3 ; VI-NEXT: .LBB89_2: ; %cmp.true -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_and_b32 s8, s17, 0xffff0000 -; VI-NEXT: s_add_i32 s9, s17, 3 -; VI-NEXT: s_and_b32 s10, s18, 0xffff0000 -; VI-NEXT: s_add_i32 s11, s18, 3 -; VI-NEXT: s_and_b32 s12, s19, 0xffff0000 -; VI-NEXT: s_add_i32 s13, s19, 3 -; VI-NEXT: s_and_b32 s14, s20, 0xffff0000 -; VI-NEXT: s_add_i32 s15, s20, 3 -; VI-NEXT: s_and_b32 s16, s21, 0xffff0000 -; VI-NEXT: s_add_i32 s17, s21, 3 -; VI-NEXT: s_and_b32 s18, s22, 0xffff0000 -; VI-NEXT: s_add_i32 s19, s22, 3 -; VI-NEXT: s_and_b32 s20, s23, 0xffff0000 -; VI-NEXT: s_add_i32 s21, s23, 3 -; VI-NEXT: s_and_b32 s22, s24, 0xffff0000 -; VI-NEXT: s_add_i32 s23, s24, 3 -; VI-NEXT: s_and_b32 s24, s25, 0xffff0000 -; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_and_b32 s40, s26, 0xffff0000 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_and_b32 s41, s27, 0xffff0000 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_and_b32 s42, s28, 0xffff0000 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_and_b32 s43, s29, 0xffff0000 -; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_and_b32 s21, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s22, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_and_b32 s23, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_and_b32 s24, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_and_b32 s25, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s26, s15, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_and_b32 s27, s14, 0xffff0000 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_and_b32 s28, s13, 0xffff0000 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_and_b32 s29, s12, 0xffff0000 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_and_b32 s40, s11, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_and_b32 s41, s10, 0xffff0000 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_and_b32 s42, s8, 0xffff0000 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_and_b32 s43, s7, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s7, 3 ; VI-NEXT: s_and_b32 s44, s6, 0xffff0000 ; VI-NEXT: s_add_i32 s6, s6, 3 -; VI-NEXT: s_and_b32 s45, s7, 0xffff0000 -; VI-NEXT: s_add_i32 s7, s7, 3 -; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s45, s9, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_and_b32 s9, s9, 0xffff ; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_and_b32 s29, s29, 0xffff -; VI-NEXT: s_and_b32 s28, s28, 0xffff -; VI-NEXT: s_and_b32 s27, s27, 0xffff -; VI-NEXT: s_and_b32 s26, s26, 0xffff -; VI-NEXT: s_and_b32 s25, s25, 0xffff -; VI-NEXT: s_and_b32 s23, s23, 0xffff -; VI-NEXT: s_and_b32 s21, s21, 0xffff -; VI-NEXT: s_and_b32 s19, s19, 0xffff -; VI-NEXT: s_and_b32 s17, s17, 0xffff -; VI-NEXT: s_and_b32 s15, s15, 0xffff -; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_and_b32 s10, s10, 0xffff ; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s12, s12, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s14, s14, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s18, s18, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s20, s20, 0xffff ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s7, s45, s7 +; VI-NEXT: s_or_b32 s9, s45, s9 ; VI-NEXT: s_or_b32 s6, s44, s6 -; VI-NEXT: s_or_b32 s29, s43, s29 -; VI-NEXT: s_or_b32 s28, s42, s28 -; VI-NEXT: s_or_b32 s27, s41, s27 -; VI-NEXT: s_or_b32 s26, s40, s26 -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_or_b32 s14, s14, s15 -; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s7, s43, s7 +; VI-NEXT: s_or_b32 s8, s42, s8 +; VI-NEXT: s_or_b32 s10, s41, s10 +; VI-NEXT: s_or_b32 s11, s40, s11 +; VI-NEXT: s_or_b32 s12, s29, s12 +; VI-NEXT: s_or_b32 s13, s28, s13 +; VI-NEXT: s_or_b32 s14, s27, s14 +; VI-NEXT: s_or_b32 s15, s26, s15 +; VI-NEXT: s_or_b32 s16, s25, s16 +; VI-NEXT: s_or_b32 s17, s24, s17 +; VI-NEXT: s_or_b32 s18, s23, s18 +; VI-NEXT: s_or_b32 s19, s22, s19 +; VI-NEXT: s_or_b32 s20, s21, s20 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 ; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: s_add_i32 s29, s29, 0x30000 -; VI-NEXT: s_add_i32 s28, s28, 0x30000 -; VI-NEXT: s_add_i32 s27, s27, 0x30000 -; VI-NEXT: s_add_i32 s26, s26, 0x30000 -; VI-NEXT: s_add_i32 s25, s24, 0x30000 -; VI-NEXT: s_add_i32 s24, s22, 0x30000 -; VI-NEXT: s_add_i32 s23, s20, 0x30000 -; VI-NEXT: s_add_i32 s22, s18, 0x30000 -; VI-NEXT: s_add_i32 s21, s16, 0x30000 -; VI-NEXT: s_add_i32 s20, s14, 0x30000 -; VI-NEXT: s_add_i32 s19, s12, 0x30000 -; VI-NEXT: s_add_i32 s18, s10, 0x30000 -; VI-NEXT: s_add_i32 s17, s8, 0x30000 -; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s18, s18, 0x30000 +; VI-NEXT: s_add_i32 s19, s19, 0x30000 +; VI-NEXT: s_add_i32 s20, s20, 0x30000 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 ; VI-NEXT: .LBB89_3: ; %end -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 -; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 -; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v0, s21 +; VI-NEXT: v_mov_b32_e32 v1, s20 +; VI-NEXT: v_mov_b32_e32 v2, s19 +; VI-NEXT: v_mov_b32_e32 v3, s18 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v5, s16 +; VI-NEXT: v_mov_b32_e32 v6, s15 +; VI-NEXT: v_mov_b32_e32 v7, s14 +; VI-NEXT: v_mov_b32_e32 v8, s13 +; VI-NEXT: v_mov_b32_e32 v9, s12 +; VI-NEXT: v_mov_b32_e32 v10, s11 +; VI-NEXT: v_mov_b32_e32 v11, s10 +; VI-NEXT: v_mov_b32_e32 v12, s8 +; VI-NEXT: v_mov_b32_e32 v13, s7 ; VI-NEXT: v_mov_b32_e32 v14, s6 -; VI-NEXT: v_mov_b32_e32 v15, s7 +; VI-NEXT: v_mov_b32_e32 v15, s9 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB89_4: ; VI-NEXT: s_branch .LBB89_2 @@ -60524,111 +61016,139 @@ define inreg <32 x bfloat> @bitcast_v32i16_to_v32bf16_scalar(<32 x i16> inreg %a ; VI-LABEL: bitcast_v32i16_to_v32bf16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, s16 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v5, s18 +; VI-NEXT: v_mov_b32_e32 v6, s19 +; VI-NEXT: v_mov_b32_e32 v7, s20 +; VI-NEXT: v_mov_b32_e32 v8, s21 +; VI-NEXT: v_mov_b32_e32 v9, s22 +; VI-NEXT: v_mov_b32_e32 v10, s23 +; VI-NEXT: v_mov_b32_e32 v11, s24 +; VI-NEXT: v_mov_b32_e32 v12, s25 +; VI-NEXT: v_mov_b32_e32 v13, s26 +; VI-NEXT: v_mov_b32_e32 v14, s27 +; VI-NEXT: v_mov_b32_e32 v15, s28 +; VI-NEXT: v_mov_b32_e32 v16, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_readfirstlane_b32 s21, v3 +; VI-NEXT: v_readfirstlane_b32 s20, v4 +; VI-NEXT: v_readfirstlane_b32 s19, v5 +; VI-NEXT: v_readfirstlane_b32 s18, v6 +; VI-NEXT: v_readfirstlane_b32 s17, v7 +; VI-NEXT: v_readfirstlane_b32 s16, v8 +; VI-NEXT: v_readfirstlane_b32 s15, v9 +; VI-NEXT: v_readfirstlane_b32 s14, v10 +; VI-NEXT: v_readfirstlane_b32 s13, v11 +; VI-NEXT: v_readfirstlane_b32 s12, v12 +; VI-NEXT: v_readfirstlane_b32 s11, v13 +; VI-NEXT: v_readfirstlane_b32 s10, v14 +; VI-NEXT: v_readfirstlane_b32 s8, v15 +; VI-NEXT: v_readfirstlane_b32 s7, v16 ; VI-NEXT: v_readfirstlane_b32 s6, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_readfirstlane_b32 s7, v1 +; VI-NEXT: v_readfirstlane_b32 s9, v1 ; VI-NEXT: s_cbranch_scc0 .LBB93_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB93_3 ; VI-NEXT: .LBB93_2: ; %cmp.true -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: s_add_i32 s5, s16, 3 -; VI-NEXT: s_and_b32 s8, s17, 0xffff0000 -; VI-NEXT: s_add_i32 s9, s17, 3 -; VI-NEXT: s_and_b32 s10, s18, 0xffff0000 -; VI-NEXT: s_add_i32 s11, s18, 3 -; VI-NEXT: s_and_b32 s12, s19, 0xffff0000 -; VI-NEXT: s_add_i32 s13, s19, 3 -; VI-NEXT: s_and_b32 s14, s20, 0xffff0000 -; VI-NEXT: s_add_i32 s15, s20, 3 -; VI-NEXT: s_and_b32 s16, s21, 0xffff0000 -; VI-NEXT: s_add_i32 s17, s21, 3 -; VI-NEXT: s_and_b32 s18, s22, 0xffff0000 -; VI-NEXT: s_add_i32 s19, s22, 3 -; VI-NEXT: s_and_b32 s20, s23, 0xffff0000 -; VI-NEXT: s_add_i32 s21, s23, 3 -; VI-NEXT: s_and_b32 s22, s24, 0xffff0000 -; VI-NEXT: s_add_i32 s23, s24, 3 -; VI-NEXT: s_and_b32 s24, s25, 0xffff0000 -; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_and_b32 s40, s26, 0xffff0000 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_and_b32 s41, s27, 0xffff0000 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_and_b32 s42, s28, 0xffff0000 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_and_b32 s43, s29, 0xffff0000 -; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_and_b32 s21, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s22, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_and_b32 s23, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_and_b32 s24, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_and_b32 s25, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s26, s15, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_and_b32 s27, s14, 0xffff0000 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_and_b32 s28, s13, 0xffff0000 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_and_b32 s29, s12, 0xffff0000 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_and_b32 s40, s11, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_and_b32 s41, s10, 0xffff0000 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_and_b32 s42, s8, 0xffff0000 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_and_b32 s43, s7, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s7, 3 ; VI-NEXT: s_and_b32 s44, s6, 0xffff0000 ; VI-NEXT: s_add_i32 s6, s6, 3 -; VI-NEXT: s_and_b32 s45, s7, 0xffff0000 -; VI-NEXT: s_add_i32 s7, s7, 3 -; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s45, s9, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_and_b32 s9, s9, 0xffff ; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_and_b32 s29, s29, 0xffff -; VI-NEXT: s_and_b32 s28, s28, 0xffff -; VI-NEXT: s_and_b32 s27, s27, 0xffff -; VI-NEXT: s_and_b32 s26, s26, 0xffff -; VI-NEXT: s_and_b32 s25, s25, 0xffff -; VI-NEXT: s_and_b32 s23, s23, 0xffff -; VI-NEXT: s_and_b32 s21, s21, 0xffff -; VI-NEXT: s_and_b32 s19, s19, 0xffff -; VI-NEXT: s_and_b32 s17, s17, 0xffff -; VI-NEXT: s_and_b32 s15, s15, 0xffff -; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_and_b32 s10, s10, 0xffff ; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s12, s12, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s14, s14, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s18, s18, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s20, s20, 0xffff ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s7, s45, s7 +; VI-NEXT: s_or_b32 s9, s45, s9 ; VI-NEXT: s_or_b32 s6, s44, s6 -; VI-NEXT: s_or_b32 s29, s43, s29 -; VI-NEXT: s_or_b32 s28, s42, s28 -; VI-NEXT: s_or_b32 s27, s41, s27 -; VI-NEXT: s_or_b32 s26, s40, s26 -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_or_b32 s14, s14, s15 -; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s7, s43, s7 +; VI-NEXT: s_or_b32 s8, s42, s8 +; VI-NEXT: s_or_b32 s10, s41, s10 +; VI-NEXT: s_or_b32 s11, s40, s11 +; VI-NEXT: s_or_b32 s12, s29, s12 +; VI-NEXT: s_or_b32 s13, s28, s13 +; VI-NEXT: s_or_b32 s14, s27, s14 +; VI-NEXT: s_or_b32 s15, s26, s15 +; VI-NEXT: s_or_b32 s16, s25, s16 +; VI-NEXT: s_or_b32 s17, s24, s17 +; VI-NEXT: s_or_b32 s18, s23, s18 +; VI-NEXT: s_or_b32 s19, s22, s19 +; VI-NEXT: s_or_b32 s20, s21, s20 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 ; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: s_add_i32 s29, s29, 0x30000 -; VI-NEXT: s_add_i32 s28, s28, 0x30000 -; VI-NEXT: s_add_i32 s27, s27, 0x30000 -; VI-NEXT: s_add_i32 s26, s26, 0x30000 -; VI-NEXT: s_add_i32 s25, s24, 0x30000 -; VI-NEXT: s_add_i32 s24, s22, 0x30000 -; VI-NEXT: s_add_i32 s23, s20, 0x30000 -; VI-NEXT: s_add_i32 s22, s18, 0x30000 -; VI-NEXT: s_add_i32 s21, s16, 0x30000 -; VI-NEXT: s_add_i32 s20, s14, 0x30000 -; VI-NEXT: s_add_i32 s19, s12, 0x30000 -; VI-NEXT: s_add_i32 s18, s10, 0x30000 -; VI-NEXT: s_add_i32 s17, s8, 0x30000 -; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s18, s18, 0x30000 +; VI-NEXT: s_add_i32 s19, s19, 0x30000 +; VI-NEXT: s_add_i32 s20, s20, 0x30000 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 ; VI-NEXT: .LBB93_3: ; %end -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 -; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 -; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v0, s21 +; VI-NEXT: v_mov_b32_e32 v1, s20 +; VI-NEXT: v_mov_b32_e32 v2, s19 +; VI-NEXT: v_mov_b32_e32 v3, s18 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v5, s16 +; VI-NEXT: v_mov_b32_e32 v6, s15 +; VI-NEXT: v_mov_b32_e32 v7, s14 +; VI-NEXT: v_mov_b32_e32 v8, s13 +; VI-NEXT: v_mov_b32_e32 v9, s12 +; VI-NEXT: v_mov_b32_e32 v10, s11 +; VI-NEXT: v_mov_b32_e32 v11, s10 +; VI-NEXT: v_mov_b32_e32 v12, s8 +; VI-NEXT: v_mov_b32_e32 v13, s7 ; VI-NEXT: v_mov_b32_e32 v14, s6 -; VI-NEXT: v_mov_b32_e32 v15, s7 +; VI-NEXT: v_mov_b32_e32 v15, s9 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB93_4: ; VI-NEXT: s_branch .LBB93_2 @@ -62428,654 +62948,340 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; VI-LABEL: bitcast_v32bf16_to_v32i16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v20, s30, 0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_writelane_b32 v20, s31, 1 -; VI-NEXT: v_readfirstlane_b32 s30, v0 +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v22, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v21, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v20, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v19, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v18, s25 +; VI-NEXT: v_mov_b32_e32 v17, s27 +; VI-NEXT: v_mov_b32_e32 v16, s29 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_readfirstlane_b32 s31, v1 -; VI-NEXT: s_cbranch_scc0 .LBB95_3 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_cbranch_scc0 .LBB95_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB95_4 +; VI-NEXT: s_cbranch_execnz .LBB95_3 ; VI-NEXT: .LBB95_2: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v16, 0x40c00000 -; VI-NEXT: s_lshl_b32 s4, s26, 16 -; VI-NEXT: v_add_f32_e32 v4, s4, v16 -; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v1 ; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; VI-NEXT: v_add_f32_e32 v1, s4, v16 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_lshl_b32 s4, s30, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v16 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: s_and_b32 s6, s26, 0xffff0000 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[2:3] -; VI-NEXT: v_add_f32_e32 v2, s6, v16 -; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[0:1] -; VI-NEXT: v_bfe_u32 v0, v4, 16, 1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v4 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v1, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: s_lshl_b32 s6, s24, 16 -; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; VI-NEXT: v_add_f32_e32 v2, s6, v16 -; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[0:1] -; VI-NEXT: v_bfe_u32 v0, v2, 16, 1 -; VI-NEXT: s_and_b32 s6, s24, 0xffff0000 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; VI-NEXT: v_or_b32_e32 v1, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; VI-NEXT: v_add_f32_e32 v2, s6, v16 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: s_lshl_b32 s6, s22, 16 -; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; VI-NEXT: v_add_f32_e32 v2, s6, v16 -; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[0:1] -; VI-NEXT: v_bfe_u32 v0, v2, 16, 1 -; VI-NEXT: s_and_b32 s6, s22, 0xffff0000 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; VI-NEXT: v_or_b32_e32 v1, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; VI-NEXT: v_add_f32_e32 v2, s6, v16 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: s_lshl_b32 s6, s20, 16 -; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; VI-NEXT: v_add_f32_e32 v2, s6, v16 -; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[0:1] -; VI-NEXT: v_bfe_u32 v0, v2, 16, 1 -; VI-NEXT: s_and_b32 s6, s20, 0xffff0000 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; VI-NEXT: v_or_b32_e32 v1, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; VI-NEXT: v_add_f32_e32 v2, s6, v16 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v22, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: s_lshl_b32 s6, s18, 16 -; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; VI-NEXT: v_add_f32_e32 v2, s6, v16 -; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] -; VI-NEXT: v_bfe_u32 v0, v2, 16, 1 -; VI-NEXT: s_and_b32 s6, s18, 0xffff0000 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; VI-NEXT: v_or_b32_e32 v1, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; VI-NEXT: v_add_f32_e32 v2, s6, v16 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v3 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc -; VI-NEXT: s_and_b32 s7, s16, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; VI-NEXT: v_add_f32_e32 v5, s7, v16 -; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] -; VI-NEXT: v_bfe_u32 v0, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v5 -; VI-NEXT: s_lshl_b32 s6, s16, 16 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 -; VI-NEXT: v_or_b32_e32 v1, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_add_f32_e32 v3, s6, v16 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc -; VI-NEXT: v_bfe_u32 v0, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v3 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_and_b32 s5, s17, 0xffff0000 -; VI-NEXT: v_add_f32_e32 v3, s5, v16 -; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] -; VI-NEXT: v_bfe_u32 v1, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v3, s4, v16 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: s_and_b32 s5, s19, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v3, s5, v16 -; VI-NEXT: v_cndmask_b32_e32 v17, v5, v7, vcc -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v3, s4, v16 -; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: s_and_b32 s5, s21, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; VI-NEXT: v_add_f32_e32 v5, s5, v16 -; VI-NEXT: v_mov_b32_e32 v1, v17 -; VI-NEXT: v_cndmask_b32_e32 v17, v7, v9, vcc +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v21 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 ; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 -; VI-NEXT: s_lshl_b32 s4, s21, 16 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_add_f32_e32 v5, s4, v16 -; VI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; VI-NEXT: v_bfe_u32 v9, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: s_and_b32 s5, s23, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 -; VI-NEXT: v_add_f32_e32 v7, s5, v16 -; VI-NEXT: v_mov_b32_e32 v3, v17 -; VI-NEXT: v_cndmask_b32_e32 v17, v9, v11, vcc +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 ; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 -; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_add_f32_e32 v7, s4, v16 -; VI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc -; VI-NEXT: v_bfe_u32 v11, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v7 -; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 -; VI-NEXT: s_and_b32 s5, s25, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v7 +; VI-NEXT: v_bfe_u32 v7, v4, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v24, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v4 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v20 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 -; VI-NEXT: v_add_f32_e32 v9, s5, v16 -; VI-NEXT: v_mov_b32_e32 v5, v17 -; VI-NEXT: v_cndmask_b32_e32 v17, v11, v13, vcc +; VI-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v20 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 ; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 -; VI-NEXT: s_lshl_b32 s4, s25, 16 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: v_add_f32_e32 v9, s4, v16 -; VI-NEXT: v_cndmask_b32_e32 v11, v11, v13, vcc -; VI-NEXT: v_bfe_u32 v13, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v9 -; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: s_and_b32 s5, s27, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v9 +; VI-NEXT: v_bfe_u32 v9, v6, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v20, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v19 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 -; VI-NEXT: v_add_f32_e32 v11, s5, v16 -; VI-NEXT: v_mov_b32_e32 v7, v17 -; VI-NEXT: v_cndmask_b32_e32 v17, v13, v15, vcc +; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 ; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 -; VI-NEXT: s_lshl_b32 s4, s27, 16 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v11 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: v_add_f32_e32 v11, s4, v16 -; VI-NEXT: v_cndmask_b32_e32 v13, v13, v15, vcc -; VI-NEXT: v_bfe_u32 v15, v11, 16, 1 -; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] -; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v11 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 -; VI-NEXT: s_and_b32 s5, s29, 0xffff0000 -; VI-NEXT: v_mov_b32_e32 v9, v17 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v11 +; VI-NEXT: v_bfe_u32 v11, v8, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v26, v13, v19, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v8 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v11, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v11 +; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v18 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v11 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 -; VI-NEXT: v_add_f32_e32 v13, s5, v16 -; VI-NEXT: v_cndmask_b32_e32 v17, v15, v17, vcc -; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 -; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] -; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 -; VI-NEXT: v_mov_b32_e32 v11, v17 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_add_f32_e32 v13, s4, v16 -; VI-NEXT: v_cndmask_b32_e32 v15, v15, v17, vcc -; VI-NEXT: v_bfe_u32 v17, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v13 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_add_f32_e32 v13, s4, v16 -; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 -; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_cndmask_b32_e32 v11, v13, v19, vcc +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v18 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v18, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v13 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: s_lshl_b32 s4, s31, 16 -; VI-NEXT: v_cndmask_b32_e32 v13, v15, v19, vcc -; VI-NEXT: v_add_f32_e32 v15, s4, v16 -; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: v_cndmask_b32_e32 v15, v16, v19, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 -; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] -; VI-NEXT: v_lshrrev_b64 v[15:16], 16, v[15:16] -; VI-NEXT: v_mov_b32_e32 v13, v17 -; VI-NEXT: s_branch .LBB95_5 -; VI-NEXT: .LBB95_3: -; VI-NEXT: s_branch .LBB95_2 -; VI-NEXT: .LBB95_4: -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 -; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 -; VI-NEXT: v_mov_b32_e32 v13, s29 -; VI-NEXT: v_mov_b32_e32 v14, s30 -; VI-NEXT: v_mov_b32_e32 v15, s31 -; VI-NEXT: .LBB95_5: ; %end -; VI-NEXT: v_readlane_b32 s31, v20, 1 -; VI-NEXT: v_readlane_b32 s30, v20, 0 -; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; VI-NEXT: v_bfe_u32 v13, v10, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v10 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v28, 0x400000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v13, v13, v28, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v28, 0x400000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_cndmask_b32_e32 v10, v11, v28, vcc +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v13 +; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v17 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v28, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v28, vcc, v28, v13 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x7fff, v28 +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; VI-NEXT: v_cndmask_b32_e32 v13, v28, v29, vcc +; VI-NEXT: v_bfe_u32 v28, v17, 16, 1 +; VI-NEXT: v_add_u32_e32 v28, vcc, v28, v17 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x7fff, v28 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_cndmask_b32_e32 v28, v28, v29, vcc +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; VI-NEXT: v_bfe_u32 v17, v12, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v12 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v30, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v30, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v30, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v30, vcc +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v17 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; VI-NEXT: v_bfe_u32 v30, v17, 16, 1 +; VI-NEXT: v_add_u32_e32 v30, vcc, v30, v17 +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x7fff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_or_b32_e32 v31, 0x400000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_cndmask_b32_e32 v17, v30, v31, vcc +; VI-NEXT: v_bfe_u32 v30, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v30, vcc, v30, v16 +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x7fff, v30 +; VI-NEXT: v_or_b32_e32 v31, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v30, v31, vcc +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v31, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v14 +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x7fff, v31 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; VI-NEXT: v_cndmask_b32_e32 v14, v31, v32, vcc +; VI-NEXT: v_bfe_u32 v31, v30, 16, 1 +; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v30 +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x7fff, v31 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v30 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; VI-NEXT: v_cndmask_b32_e32 v30, v31, v32, vcc +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v15 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v32, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v14 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x7fff, v32 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[22:23] +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[24:25] +; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[20:21] +; VI-NEXT: v_cndmask_b32_e32 v32, v32, v33, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 +; VI-NEXT: v_mov_b32_e32 v21, v23 +; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[26:27] +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_mov_b32_e32 v19, v23 +; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[28:29] +; VI-NEXT: v_cndmask_b32_e32 v14, v15, v33, vcc +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v32 +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[16:17] +; VI-NEXT: v_mov_b32_e32 v17, v23 +; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[14:15] +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[30:31] +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[12:13] +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[10:11] +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[8:9] +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[6:7] +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[4:5] +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v15, v23 +; VI-NEXT: .LBB95_3: ; %end +; VI-NEXT: v_mov_b32_e32 v1, v22 +; VI-NEXT: v_mov_b32_e32 v3, v21 +; VI-NEXT: v_mov_b32_e32 v5, v20 +; VI-NEXT: v_mov_b32_e32 v7, v19 +; VI-NEXT: v_mov_b32_e32 v9, v18 +; VI-NEXT: v_mov_b32_e32 v11, v17 +; VI-NEXT: v_mov_b32_e32 v13, v16 ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB95_4: +; VI-NEXT: s_branch .LBB95_2 ; ; GFX9-LABEL: bitcast_v32bf16_to_v32i16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v20, s30, 0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: v_writelane_b32 v20, s31, 1 -; GFX9-NEXT: v_readfirstlane_b32 s30, v0 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_readfirstlane_b32 s31, v1 -; GFX9-NEXT: s_cbranch_scc0 .LBB95_3 -; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB95_4 -; GFX9-NEXT: .LBB95_2: ; %cmp.true -; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; GFX9-NEXT: s_and_b32 s5, s30, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v1, s5, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s5, s30, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s5, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b32 s5, s31, 0xffff0000 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add_f32_e32 v3, s5, v0 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: s_lshl_b32 s5, s31, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_add_f32_e32 v4, s5, v0 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 -; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 -; GFX9-NEXT: v_mov_b32_e32 v16, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_and_or_b32 v14, v1, v16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_and_or_b32 v15, v3, v16, v4 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s29, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_and_b32 s4, s28, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_or_b32 v13, v1, v16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s28, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_or_b32 v12, v1, v16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s27, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_and_b32 s4, s26, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_or_b32 v11, v1, v16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s26, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_or_b32 v10, v1, v16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s25, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_and_b32 s4, s24, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_or_b32 v9, v1, v16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s24, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_or_b32 v8, v1, v16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s23, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_or_b32 v7, v1, v16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s22, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_or_b32 v6, v1, v16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s21, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_or_b32 v5, v1, v16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s20, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_or_b32 v4, v1, v16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s19, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v17, vcc -; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_or_b32 v3, v1, v16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s18, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v17, v17, v2 -; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc -; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_or_b32 v2, v1, v16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v17, v17, v1 -; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s17, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc -; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 -; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GFX9-NEXT: v_and_or_b32 v1, v1, v16, v17 -; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 -; GFX9-NEXT: s_lshl_b32 s4, s16, 16 -; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: v_bfe_u32 v18, v0, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v18, v18, v0 -; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_and_or_b32 v0, v17, v16, v0 -; GFX9-NEXT: s_branch .LBB95_5 -; GFX9-NEXT: .LBB95_3: -; GFX9-NEXT: s_branch .LBB95_2 -; GFX9-NEXT: .LBB95_4: +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 @@ -63089,17 +63295,305 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v13, s29 -; GFX9-NEXT: v_mov_b32_e32 v14, s30 -; GFX9-NEXT: v_mov_b32_e32 v15, s31 -; GFX9-NEXT: .LBB95_5: ; %end -; GFX9-NEXT: v_readlane_b32 s31, v20, 1 -; GFX9-NEXT: v_readlane_b32 s30, v20, 0 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_cbranch_scc0 .LBB95_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB95_3 +; GFX9-NEXT: .LBB95_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v0 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v16 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v0 +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v1 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v2 +; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX9-NEXT: v_bfe_u32 v19, v18, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v19, v19, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_add_u32_e32 v19, 0x7fff, v19 +; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc +; GFX9-NEXT: v_bfe_u32 v19, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v19, v19, v2 +; GFX9-NEXT: v_add_u32_e32 v19, 0x7fff, v19 +; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v19, v20, vcc +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v3 +; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GFX9-NEXT: v_bfe_u32 v20, v19, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v20, v20, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_add_u32_e32 v20, 0x7fff, v20 +; GFX9-NEXT: v_or_b32_e32 v21, 0x400000, v19 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc +; GFX9-NEXT: v_bfe_u32 v20, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v20, v20, v3 +; GFX9-NEXT: v_add_u32_e32 v20, 0x7fff, v20 +; GFX9-NEXT: v_or_b32_e32 v21, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v20, v21, vcc +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v4 +; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX9-NEXT: v_bfe_u32 v21, v20, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v21, v21, v20 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_add_u32_e32 v21, 0x7fff, v21 +; GFX9-NEXT: v_or_b32_e32 v22, 0x400000, v20 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v21, v22, vcc +; GFX9-NEXT: v_bfe_u32 v21, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v21, v21, v4 +; GFX9-NEXT: v_add_u32_e32 v21, 0x7fff, v21 +; GFX9-NEXT: v_or_b32_e32 v22, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v21, v22, vcc +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v5 +; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GFX9-NEXT: v_bfe_u32 v22, v21, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v22, v22, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_add_u32_e32 v22, 0x7fff, v22 +; GFX9-NEXT: v_or_b32_e32 v23, 0x400000, v21 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc +; GFX9-NEXT: v_bfe_u32 v22, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v22, v22, v5 +; GFX9-NEXT: v_add_u32_e32 v22, 0x7fff, v22 +; GFX9-NEXT: v_or_b32_e32 v23, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v22, v23, vcc +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v6 +; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX9-NEXT: v_bfe_u32 v23, v22, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v23, v23, v22 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_add_u32_e32 v23, 0x7fff, v23 +; GFX9-NEXT: v_or_b32_e32 v24, 0x400000, v22 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v23, v24, vcc +; GFX9-NEXT: v_bfe_u32 v23, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v23, v23, v6 +; GFX9-NEXT: v_add_u32_e32 v23, 0x7fff, v23 +; GFX9-NEXT: v_or_b32_e32 v24, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v23, v24, vcc +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v7 +; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX9-NEXT: v_bfe_u32 v24, v23, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v24, v24, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_add_u32_e32 v24, 0x7fff, v24 +; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v23 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc +; GFX9-NEXT: v_bfe_u32 v24, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v24, v24, v7 +; GFX9-NEXT: v_add_u32_e32 v24, 0x7fff, v24 +; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v24, v25, vcc +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v8 +; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX9-NEXT: v_bfe_u32 v25, v24, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v25, v25, v24 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX9-NEXT: v_add_u32_e32 v25, 0x7fff, v25 +; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v24 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v25, v26, vcc +; GFX9-NEXT: v_bfe_u32 v25, v8, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v25, v25, v8 +; GFX9-NEXT: v_add_u32_e32 v25, 0x7fff, v25 +; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v25, v26, vcc +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v9 +; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX9-NEXT: v_bfe_u32 v26, v25, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v26, v26, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX9-NEXT: v_add_u32_e32 v26, 0x7fff, v26 +; GFX9-NEXT: v_or_b32_e32 v27, 0x400000, v25 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v26, v27, vcc +; GFX9-NEXT: v_bfe_u32 v26, v9, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v26, v26, v9 +; GFX9-NEXT: v_add_u32_e32 v26, 0x7fff, v26 +; GFX9-NEXT: v_or_b32_e32 v27, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v26, v27, vcc +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v10 +; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX9-NEXT: v_bfe_u32 v27, v26, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v27, v27, v26 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX9-NEXT: v_add_u32_e32 v27, 0x7fff, v27 +; GFX9-NEXT: v_or_b32_e32 v28, 0x400000, v26 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v27, v28, vcc +; GFX9-NEXT: v_bfe_u32 v27, v10, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v27, v27, v10 +; GFX9-NEXT: v_add_u32_e32 v27, 0x7fff, v27 +; GFX9-NEXT: v_or_b32_e32 v28, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v27, v28, vcc +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v11 +; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX9-NEXT: v_bfe_u32 v28, v27, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v28, v28, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX9-NEXT: v_add_u32_e32 v28, 0x7fff, v28 +; GFX9-NEXT: v_or_b32_e32 v29, 0x400000, v27 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v28, v29, vcc +; GFX9-NEXT: v_bfe_u32 v28, v11, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v28, v28, v11 +; GFX9-NEXT: v_add_u32_e32 v28, 0x7fff, v28 +; GFX9-NEXT: v_or_b32_e32 v29, 0x400000, v11 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v28, v29, vcc +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v12 +; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX9-NEXT: v_bfe_u32 v29, v28, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v29, v29, v28 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX9-NEXT: v_add_u32_e32 v29, 0x7fff, v29 +; GFX9-NEXT: v_or_b32_e32 v30, 0x400000, v28 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v29, v30, vcc +; GFX9-NEXT: v_bfe_u32 v29, v12, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v29, v29, v12 +; GFX9-NEXT: v_add_u32_e32 v29, 0x7fff, v29 +; GFX9-NEXT: v_or_b32_e32 v30, 0x400000, v12 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v29, v30, vcc +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v13 +; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX9-NEXT: v_bfe_u32 v30, v29, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v30, v30, v29 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX9-NEXT: v_add_u32_e32 v30, 0x7fff, v30 +; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v29 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v30, v31, vcc +; GFX9-NEXT: v_bfe_u32 v30, v13, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v30, v30, v13 +; GFX9-NEXT: v_add_u32_e32 v30, 0x7fff, v30 +; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v30, v31, vcc +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v14 +; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX9-NEXT: v_bfe_u32 v31, v30, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v31, v31, v30 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX9-NEXT: v_add_u32_e32 v31, 0x7fff, v31 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v30 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v31, v32, vcc +; GFX9-NEXT: v_bfe_u32 v31, v14, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v31, v31, v14 +; GFX9-NEXT: v_add_u32_e32 v31, 0x7fff, v31 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v31, v32, vcc +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 +; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v32, v32, v31 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX9-NEXT: v_add_u32_e32 v32, 0x7fff, v32 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc +; GFX9-NEXT: v_bfe_u32 v32, v15, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v32, v32, v15 +; GFX9-NEXT: v_add_u32_e32 v32, 0x7fff, v32 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v32, v33, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX9-NEXT: v_mov_b32_e32 v32, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_and_or_b32 v15, v31, v32, v15 +; GFX9-NEXT: v_and_or_b32 v14, v30, v32, v14 +; GFX9-NEXT: v_and_or_b32 v13, v29, v32, v13 +; GFX9-NEXT: v_and_or_b32 v12, v28, v32, v12 +; GFX9-NEXT: v_and_or_b32 v11, v27, v32, v11 +; GFX9-NEXT: v_and_or_b32 v10, v26, v32, v10 +; GFX9-NEXT: v_and_or_b32 v9, v25, v32, v9 +; GFX9-NEXT: v_and_or_b32 v8, v24, v32, v8 +; GFX9-NEXT: v_and_or_b32 v7, v23, v32, v7 +; GFX9-NEXT: v_and_or_b32 v6, v22, v32, v6 +; GFX9-NEXT: v_and_or_b32 v5, v21, v32, v5 +; GFX9-NEXT: v_and_or_b32 v4, v20, v32, v4 +; GFX9-NEXT: v_and_or_b32 v3, v19, v32, v3 +; GFX9-NEXT: v_and_or_b32 v2, v18, v32, v2 +; GFX9-NEXT: v_and_or_b32 v1, v17, v32, v1 +; GFX9-NEXT: v_and_or_b32 v0, v16, v32, v0 +; GFX9-NEXT: .LBB95_3: ; %end ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB95_4: +; GFX9-NEXT: s_branch .LBB95_2 ; ; GFX11-TRUE16-LABEL: bitcast_v32bf16_to_v32i16_scalar: ; GFX11-TRUE16: ; %bb.0: @@ -66599,32 +67093,60 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v4, s30, 0 -; VI-NEXT: v_writelane_b32 v4, s31, 1 -; VI-NEXT: v_writelane_b32 v4, s34, 2 -; VI-NEXT: v_writelane_b32 v4, s35, 3 -; VI-NEXT: v_writelane_b32 v4, s36, 4 -; VI-NEXT: v_writelane_b32 v4, s37, 5 -; VI-NEXT: v_writelane_b32 v4, s38, 6 -; VI-NEXT: v_writelane_b32 v4, s39, 7 -; VI-NEXT: v_writelane_b32 v4, s48, 8 -; VI-NEXT: v_writelane_b32 v4, s49, 9 -; VI-NEXT: v_writelane_b32 v4, s50, 10 -; VI-NEXT: v_writelane_b32 v4, s51, 11 -; VI-NEXT: v_writelane_b32 v4, s52, 12 -; VI-NEXT: v_writelane_b32 v4, s53, 13 -; VI-NEXT: v_writelane_b32 v4, s54, 14 -; VI-NEXT: v_writelane_b32 v4, s55, 15 -; VI-NEXT: v_writelane_b32 v4, s64, 16 -; VI-NEXT: v_writelane_b32 v4, s65, 17 +; VI-NEXT: v_writelane_b32 v18, s30, 0 +; VI-NEXT: v_writelane_b32 v18, s31, 1 +; VI-NEXT: v_writelane_b32 v18, s34, 2 +; VI-NEXT: v_writelane_b32 v18, s35, 3 +; VI-NEXT: v_writelane_b32 v18, s36, 4 +; VI-NEXT: v_writelane_b32 v18, s37, 5 +; VI-NEXT: v_writelane_b32 v18, s38, 6 +; VI-NEXT: v_writelane_b32 v18, s39, 7 +; VI-NEXT: v_writelane_b32 v18, s48, 8 +; VI-NEXT: v_writelane_b32 v18, s49, 9 +; VI-NEXT: v_writelane_b32 v18, s50, 10 +; VI-NEXT: v_writelane_b32 v18, s51, 11 +; VI-NEXT: v_writelane_b32 v18, s52, 12 +; VI-NEXT: v_writelane_b32 v18, s53, 13 +; VI-NEXT: v_writelane_b32 v18, s54, 14 +; VI-NEXT: v_writelane_b32 v18, s55, 15 +; VI-NEXT: v_writelane_b32 v18, s64, 16 +; VI-NEXT: v_writelane_b32 v18, s65, 17 +; VI-NEXT: v_mov_b32_e32 v4, s16 +; VI-NEXT: v_mov_b32_e32 v5, s17 +; VI-NEXT: v_mov_b32_e32 v6, s18 +; VI-NEXT: v_mov_b32_e32 v7, s19 +; VI-NEXT: v_mov_b32_e32 v8, s20 +; VI-NEXT: v_mov_b32_e32 v9, s21 +; VI-NEXT: v_mov_b32_e32 v10, s22 +; VI-NEXT: v_mov_b32_e32 v11, s23 +; VI-NEXT: v_mov_b32_e32 v12, s24 +; VI-NEXT: v_mov_b32_e32 v13, s25 +; VI-NEXT: v_mov_b32_e32 v14, s26 +; VI-NEXT: v_mov_b32_e32 v15, s27 +; VI-NEXT: v_mov_b32_e32 v16, s28 +; VI-NEXT: v_mov_b32_e32 v17, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; VI-NEXT: v_writelane_b32 v4, s66, 18 +; VI-NEXT: v_writelane_b32 v18, s66, 18 +; VI-NEXT: v_readfirstlane_b32 s18, v4 +; VI-NEXT: v_readfirstlane_b32 s19, v5 +; VI-NEXT: v_readfirstlane_b32 s16, v6 +; VI-NEXT: v_readfirstlane_b32 s17, v7 +; VI-NEXT: v_readfirstlane_b32 s14, v8 +; VI-NEXT: v_readfirstlane_b32 s15, v9 +; VI-NEXT: v_readfirstlane_b32 s12, v10 +; VI-NEXT: v_readfirstlane_b32 s13, v11 +; VI-NEXT: v_readfirstlane_b32 s10, v12 +; VI-NEXT: v_readfirstlane_b32 s11, v13 +; VI-NEXT: v_readfirstlane_b32 s8, v14 +; VI-NEXT: v_readfirstlane_b32 s9, v15 +; VI-NEXT: v_readfirstlane_b32 s6, v16 +; VI-NEXT: v_readfirstlane_b32 s7, v17 ; VI-NEXT: v_readfirstlane_b32 s4, v1 -; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_and_b64 s[20:21], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s5, v2 -; VI-NEXT: v_writelane_b32 v4, s67, 19 +; VI-NEXT: v_writelane_b32 v18, s67, 19 ; VI-NEXT: s_cbranch_scc0 .LBB97_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_lshr_b32 s56, s5, 24 @@ -66632,351 +67154,351 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; VI-NEXT: s_lshr_b32 s58, s5, 8 ; VI-NEXT: s_lshr_b32 s59, s4, 16 ; VI-NEXT: s_lshr_b32 s60, s4, 8 -; VI-NEXT: s_lshr_b32 s61, s29, 24 -; VI-NEXT: s_lshr_b32 s62, s29, 16 -; VI-NEXT: s_lshr_b32 s63, s29, 8 -; VI-NEXT: s_lshr_b32 s72, s28, 16 -; VI-NEXT: s_lshr_b32 s73, s28, 8 -; VI-NEXT: s_lshr_b32 s74, s27, 24 -; VI-NEXT: s_lshr_b32 s75, s27, 16 -; VI-NEXT: s_lshr_b32 s76, s27, 8 -; VI-NEXT: s_lshr_b32 s77, s26, 16 -; VI-NEXT: s_lshr_b32 s78, s26, 8 -; VI-NEXT: s_lshr_b32 s79, s25, 24 -; VI-NEXT: s_lshr_b32 s88, s25, 16 -; VI-NEXT: s_lshr_b32 s89, s25, 8 -; VI-NEXT: s_lshr_b32 s90, s24, 16 -; VI-NEXT: s_lshr_b32 s91, s24, 8 -; VI-NEXT: s_lshr_b32 s30, s23, 24 -; VI-NEXT: s_lshr_b32 s31, s23, 16 -; VI-NEXT: s_lshr_b32 s34, s23, 8 -; VI-NEXT: s_lshr_b32 s35, s22, 16 -; VI-NEXT: s_lshr_b32 s36, s22, 8 -; VI-NEXT: s_lshr_b32 s37, s21, 24 -; VI-NEXT: s_lshr_b32 s38, s21, 16 -; VI-NEXT: s_lshr_b32 s39, s21, 8 -; VI-NEXT: s_lshr_b32 s48, s20, 16 -; VI-NEXT: s_lshr_b32 s49, s20, 8 -; VI-NEXT: s_lshr_b32 s50, s19, 24 -; VI-NEXT: s_lshr_b32 s51, s19, 16 -; VI-NEXT: s_lshr_b32 s52, s19, 8 -; VI-NEXT: s_lshr_b32 s53, s18, 16 -; VI-NEXT: s_lshr_b32 s54, s18, 8 -; VI-NEXT: s_lshr_b32 s55, s17, 24 -; VI-NEXT: s_lshr_b32 s64, s17, 16 -; VI-NEXT: s_lshr_b32 s65, s17, 8 -; VI-NEXT: s_lshr_b32 s66, s16, 16 -; VI-NEXT: s_lshr_b32 s67, s16, 8 -; VI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 -; VI-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 -; VI-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 -; VI-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s61, s7, 24 +; VI-NEXT: s_lshr_b32 s62, s7, 16 +; VI-NEXT: s_lshr_b32 s63, s7, 8 +; VI-NEXT: s_lshr_b32 s72, s6, 16 +; VI-NEXT: s_lshr_b32 s73, s6, 8 +; VI-NEXT: s_lshr_b32 s74, s9, 24 +; VI-NEXT: s_lshr_b32 s75, s9, 16 +; VI-NEXT: s_lshr_b32 s76, s9, 8 +; VI-NEXT: s_lshr_b32 s77, s8, 16 +; VI-NEXT: s_lshr_b32 s78, s8, 8 +; VI-NEXT: s_lshr_b32 s79, s11, 24 +; VI-NEXT: s_lshr_b32 s88, s11, 16 +; VI-NEXT: s_lshr_b32 s89, s11, 8 +; VI-NEXT: s_lshr_b32 s90, s10, 16 +; VI-NEXT: s_lshr_b32 s91, s10, 8 +; VI-NEXT: s_lshr_b32 s30, s13, 24 +; VI-NEXT: s_lshr_b32 s31, s13, 16 +; VI-NEXT: s_lshr_b32 s34, s13, 8 +; VI-NEXT: s_lshr_b32 s35, s12, 16 +; VI-NEXT: s_lshr_b32 s36, s12, 8 +; VI-NEXT: s_lshr_b32 s37, s15, 24 +; VI-NEXT: s_lshr_b32 s38, s15, 16 +; VI-NEXT: s_lshr_b32 s39, s15, 8 +; VI-NEXT: s_lshr_b32 s48, s14, 16 +; VI-NEXT: s_lshr_b32 s49, s14, 8 +; VI-NEXT: s_lshr_b32 s50, s17, 24 +; VI-NEXT: s_lshr_b32 s51, s17, 16 +; VI-NEXT: s_lshr_b32 s52, s17, 8 +; VI-NEXT: s_lshr_b32 s53, s16, 16 +; VI-NEXT: s_lshr_b32 s54, s16, 8 +; VI-NEXT: s_lshr_b32 s55, s19, 24 +; VI-NEXT: s_lshr_b32 s64, s19, 16 +; VI-NEXT: s_lshr_b32 s65, s19, 8 +; VI-NEXT: s_lshr_b32 s66, s18, 16 +; VI-NEXT: s_lshr_b32 s67, s18, 8 +; VI-NEXT: s_lshr_b64 s[20:21], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[22:23], s[6:7], 24 +; VI-NEXT: s_lshr_b64 s[24:25], s[8:9], 24 +; VI-NEXT: s_lshr_b64 s[26:27], s[10:11], 24 +; VI-NEXT: s_lshr_b64 s[28:29], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[40:41], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[42:43], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[44:45], s[18:19], 24 ; VI-NEXT: s_cbranch_execnz .LBB97_3 ; VI-NEXT: .LBB97_2: ; %cmp.true -; VI-NEXT: s_add_i32 s7, s17, 3 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_add_i32 s7, s16, 3 -; VI-NEXT: s_add_i32 s17, s6, 0x30000 -; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_add_i32 s7, s19, 3 -; VI-NEXT: s_add_i32 s16, s6, 0x30000 -; VI-NEXT: s_and_b32 s6, s19, 0xffff0000 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_add_i32 s7, s18, 3 -; VI-NEXT: s_add_i32 s19, s6, 0x30000 -; VI-NEXT: s_and_b32 s6, s18, 0xffff0000 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_add_i32 s7, s21, 3 -; VI-NEXT: s_add_i32 s18, s6, 0x30000 -; VI-NEXT: s_and_b32 s6, s21, 0xffff0000 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_add_i32 s7, s20, 3 -; VI-NEXT: s_add_i32 s21, s6, 0x30000 -; VI-NEXT: s_and_b32 s6, s20, 0xffff0000 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_add_i32 s7, s23, 3 -; VI-NEXT: s_add_i32 s20, s6, 0x30000 -; VI-NEXT: s_and_b32 s6, s23, 0xffff0000 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_add_i32 s7, s22, 3 -; VI-NEXT: s_add_i32 s23, s6, 0x30000 -; VI-NEXT: s_and_b32 s6, s22, 0xffff0000 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_add_i32 s7, s25, 3 -; VI-NEXT: s_add_i32 s22, s6, 0x30000 -; VI-NEXT: s_and_b32 s6, s25, 0xffff0000 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_add_i32 s7, s24, 3 -; VI-NEXT: s_add_i32 s25, s6, 0x30000 -; VI-NEXT: s_and_b32 s6, s24, 0xffff0000 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_add_i32 s7, s27, 3 -; VI-NEXT: s_add_i32 s24, s6, 0x30000 -; VI-NEXT: s_and_b32 s6, s27, 0xffff0000 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_add_i32 s7, s26, 3 -; VI-NEXT: s_add_i32 s27, s6, 0x30000 -; VI-NEXT: s_and_b32 s6, s26, 0xffff0000 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_add_i32 s7, s29, 3 -; VI-NEXT: s_add_i32 s26, s6, 0x30000 -; VI-NEXT: s_and_b32 s6, s29, 0xffff0000 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_add_i32 s7, s28, 3 -; VI-NEXT: s_add_i32 s29, s6, 0x30000 -; VI-NEXT: s_and_b32 s6, s28, 0xffff0000 +; VI-NEXT: s_and_b32 s20, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_or_b32 s19, s20, s19 +; VI-NEXT: s_and_b32 s20, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_and_b32 s18, s18, 0xffff +; VI-NEXT: s_or_b32 s18, s20, s18 +; VI-NEXT: s_and_b32 s20, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_or_b32 s17, s20, s17 +; VI-NEXT: s_and_b32 s20, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_or_b32 s16, s20, s16 +; VI-NEXT: s_and_b32 s20, s15, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_or_b32 s15, s20, s15 +; VI-NEXT: s_and_b32 s20, s14, 0xffff0000 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_and_b32 s14, s14, 0xffff +; VI-NEXT: s_or_b32 s14, s20, s14 +; VI-NEXT: s_and_b32 s20, s13, 0xffff0000 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_or_b32 s13, s20, s13 +; VI-NEXT: s_and_b32 s20, s12, 0xffff0000 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_and_b32 s12, s12, 0xffff +; VI-NEXT: s_or_b32 s12, s20, s12 +; VI-NEXT: s_and_b32 s20, s11, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_or_b32 s11, s20, s11 +; VI-NEXT: s_and_b32 s20, s10, 0xffff0000 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_or_b32 s10, s20, s10 +; VI-NEXT: s_and_b32 s20, s9, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_or_b32 s9, s20, s9 +; VI-NEXT: s_and_b32 s20, s8, 0xffff0000 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_or_b32 s8, s20, s8 +; VI-NEXT: s_and_b32 s20, s7, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s7, 3 ; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_add_i32 s28, s6, 0x30000 -; VI-NEXT: s_and_b32 s6, s5, 0xffff0000 +; VI-NEXT: s_or_b32 s7, s20, s7 +; VI-NEXT: s_and_b32 s20, s6, 0xffff0000 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_or_b32 s6, s20, s6 +; VI-NEXT: s_and_b32 s20, s5, 0xffff0000 ; VI-NEXT: s_add_i32 s5, s5, 3 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s5, s6, s5 -; VI-NEXT: s_and_b32 s6, s4, 0xffff0000 +; VI-NEXT: s_or_b32 s5, s20, s5 +; VI-NEXT: s_and_b32 s20, s4, 0xffff0000 ; VI-NEXT: s_add_i32 s4, s4, 3 ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: s_or_b32 s4, s6, s4 +; VI-NEXT: s_or_b32 s4, s20, s4 +; VI-NEXT: s_add_i32 s19, s19, 0x30000 +; VI-NEXT: s_add_i32 s18, s18, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 ; VI-NEXT: s_add_i32 s5, s5, 0x30000 ; VI-NEXT: s_add_i32 s4, s4, 0x30000 -; VI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 -; VI-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 -; VI-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 -; VI-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[20:21], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[22:23], s[6:7], 24 +; VI-NEXT: s_lshr_b64 s[24:25], s[8:9], 24 +; VI-NEXT: s_lshr_b64 s[26:27], s[10:11], 24 +; VI-NEXT: s_lshr_b64 s[28:29], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[40:41], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[42:43], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[44:45], s[18:19], 24 ; VI-NEXT: s_lshr_b32 s56, s5, 24 ; VI-NEXT: s_lshr_b32 s57, s5, 16 ; VI-NEXT: s_lshr_b32 s58, s5, 8 ; VI-NEXT: s_lshr_b32 s59, s4, 16 ; VI-NEXT: s_lshr_b32 s60, s4, 8 -; VI-NEXT: s_lshr_b32 s61, s29, 24 -; VI-NEXT: s_lshr_b32 s62, s29, 16 -; VI-NEXT: s_lshr_b32 s63, s29, 8 -; VI-NEXT: s_lshr_b32 s72, s28, 16 -; VI-NEXT: s_lshr_b32 s73, s28, 8 -; VI-NEXT: s_lshr_b32 s74, s27, 24 -; VI-NEXT: s_lshr_b32 s75, s27, 16 -; VI-NEXT: s_lshr_b32 s76, s27, 8 -; VI-NEXT: s_lshr_b32 s77, s26, 16 -; VI-NEXT: s_lshr_b32 s78, s26, 8 -; VI-NEXT: s_lshr_b32 s79, s25, 24 -; VI-NEXT: s_lshr_b32 s88, s25, 16 -; VI-NEXT: s_lshr_b32 s89, s25, 8 -; VI-NEXT: s_lshr_b32 s90, s24, 16 -; VI-NEXT: s_lshr_b32 s91, s24, 8 -; VI-NEXT: s_lshr_b32 s30, s23, 24 -; VI-NEXT: s_lshr_b32 s31, s23, 16 -; VI-NEXT: s_lshr_b32 s34, s23, 8 -; VI-NEXT: s_lshr_b32 s35, s22, 16 -; VI-NEXT: s_lshr_b32 s36, s22, 8 -; VI-NEXT: s_lshr_b32 s37, s21, 24 -; VI-NEXT: s_lshr_b32 s38, s21, 16 -; VI-NEXT: s_lshr_b32 s39, s21, 8 -; VI-NEXT: s_lshr_b32 s48, s20, 16 -; VI-NEXT: s_lshr_b32 s49, s20, 8 -; VI-NEXT: s_lshr_b32 s50, s19, 24 -; VI-NEXT: s_lshr_b32 s51, s19, 16 -; VI-NEXT: s_lshr_b32 s52, s19, 8 -; VI-NEXT: s_lshr_b32 s53, s18, 16 -; VI-NEXT: s_lshr_b32 s54, s18, 8 -; VI-NEXT: s_lshr_b32 s55, s17, 24 -; VI-NEXT: s_lshr_b32 s64, s17, 16 -; VI-NEXT: s_lshr_b32 s65, s17, 8 -; VI-NEXT: s_lshr_b32 s66, s16, 16 -; VI-NEXT: s_lshr_b32 s67, s16, 8 +; VI-NEXT: s_lshr_b32 s61, s7, 24 +; VI-NEXT: s_lshr_b32 s62, s7, 16 +; VI-NEXT: s_lshr_b32 s63, s7, 8 +; VI-NEXT: s_lshr_b32 s72, s6, 16 +; VI-NEXT: s_lshr_b32 s73, s6, 8 +; VI-NEXT: s_lshr_b32 s74, s9, 24 +; VI-NEXT: s_lshr_b32 s75, s9, 16 +; VI-NEXT: s_lshr_b32 s76, s9, 8 +; VI-NEXT: s_lshr_b32 s77, s8, 16 +; VI-NEXT: s_lshr_b32 s78, s8, 8 +; VI-NEXT: s_lshr_b32 s79, s11, 24 +; VI-NEXT: s_lshr_b32 s88, s11, 16 +; VI-NEXT: s_lshr_b32 s89, s11, 8 +; VI-NEXT: s_lshr_b32 s90, s10, 16 +; VI-NEXT: s_lshr_b32 s91, s10, 8 +; VI-NEXT: s_lshr_b32 s30, s13, 24 +; VI-NEXT: s_lshr_b32 s31, s13, 16 +; VI-NEXT: s_lshr_b32 s34, s13, 8 +; VI-NEXT: s_lshr_b32 s35, s12, 16 +; VI-NEXT: s_lshr_b32 s36, s12, 8 +; VI-NEXT: s_lshr_b32 s37, s15, 24 +; VI-NEXT: s_lshr_b32 s38, s15, 16 +; VI-NEXT: s_lshr_b32 s39, s15, 8 +; VI-NEXT: s_lshr_b32 s48, s14, 16 +; VI-NEXT: s_lshr_b32 s49, s14, 8 +; VI-NEXT: s_lshr_b32 s50, s17, 24 +; VI-NEXT: s_lshr_b32 s51, s17, 16 +; VI-NEXT: s_lshr_b32 s52, s17, 8 +; VI-NEXT: s_lshr_b32 s53, s16, 16 +; VI-NEXT: s_lshr_b32 s54, s16, 8 +; VI-NEXT: s_lshr_b32 s55, s19, 24 +; VI-NEXT: s_lshr_b32 s64, s19, 16 +; VI-NEXT: s_lshr_b32 s65, s19, 8 +; VI-NEXT: s_lshr_b32 s66, s18, 16 +; VI-NEXT: s_lshr_b32 s67, s18, 8 ; VI-NEXT: .LBB97_3: ; %end -; VI-NEXT: s_and_b32 s7, s16, 0xff -; VI-NEXT: s_lshl_b32 s9, s67, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s66, 0xff -; VI-NEXT: s_lshl_b32 s11, s44, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: s_and_b32 s7, s17, 0xff -; VI-NEXT: s_lshl_b32 s9, s65, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s64, 0xff -; VI-NEXT: s_lshl_b32 s11, s55, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s18, 0xff -; VI-NEXT: s_lshl_b32 s9, s54, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s53, 0xff -; VI-NEXT: s_lshl_b32 s11, s42, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_and_b32 s18, s18, 0xff +; VI-NEXT: s_lshl_b32 s21, s67, 8 +; VI-NEXT: s_or_b32 s18, s18, s21 +; VI-NEXT: s_and_b32 s21, s66, 0xff +; VI-NEXT: s_lshl_b32 s23, s44, 8 +; VI-NEXT: s_or_b32 s21, s21, s23 +; VI-NEXT: s_and_b32 s18, s18, 0xffff +; VI-NEXT: s_lshl_b32 s21, s21, 16 +; VI-NEXT: s_or_b32 s18, s18, s21 +; VI-NEXT: v_mov_b32_e32 v1, s18 +; VI-NEXT: s_and_b32 s18, s19, 0xff +; VI-NEXT: s_lshl_b32 s19, s65, 8 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, s64, 0xff +; VI-NEXT: s_lshl_b32 s21, s55, 8 +; VI-NEXT: s_or_b32 s19, s19, s21 +; VI-NEXT: s_and_b32 s18, s18, 0xffff +; VI-NEXT: s_lshl_b32 s19, s19, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xff +; VI-NEXT: s_lshl_b32 s18, s54, 8 +; VI-NEXT: s_or_b32 s16, s16, s18 +; VI-NEXT: s_and_b32 s18, s53, 0xff +; VI-NEXT: s_lshl_b32 s19, s42, 8 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s18, s18, 16 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s16, s16, s18 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s19, 0xff -; VI-NEXT: s_lshl_b32 s9, s52, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s51, 0xff -; VI-NEXT: s_lshl_b32 s11, s50, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: s_and_b32 s16, s17, 0xff +; VI-NEXT: s_lshl_b32 s17, s52, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, s51, 0xff +; VI-NEXT: s_lshl_b32 s18, s50, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s16, s16, s17 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s20, 0xff -; VI-NEXT: s_lshl_b32 s9, s49, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s48, 0xff -; VI-NEXT: s_lshl_b32 s11, s40, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: s_and_b32 s14, s14, 0xff +; VI-NEXT: s_lshl_b32 s16, s49, 8 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: s_and_b32 s16, s48, 0xff +; VI-NEXT: s_lshl_b32 s17, s40, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s14, s14, 0xffff +; VI-NEXT: s_lshl_b32 s16, s16, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s14, s14, s16 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s21, 0xff -; VI-NEXT: s_lshl_b32 s9, s39, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s38, 0xff -; VI-NEXT: s_lshl_b32 s11, s37, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: s_and_b32 s14, s15, 0xff +; VI-NEXT: s_lshl_b32 s15, s39, 8 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_and_b32 s15, s38, 0xff +; VI-NEXT: s_lshl_b32 s16, s37, 8 +; VI-NEXT: s_or_b32 s15, s15, s16 +; VI-NEXT: s_and_b32 s14, s14, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s14, s14, s15 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s22, 0xff -; VI-NEXT: s_lshl_b32 s9, s36, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s35, 0xff -; VI-NEXT: s_lshl_b32 s11, s14, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: s_and_b32 s12, s12, 0xff +; VI-NEXT: s_lshl_b32 s14, s36, 8 +; VI-NEXT: s_or_b32 s12, s12, s14 +; VI-NEXT: s_and_b32 s14, s35, 0xff +; VI-NEXT: s_lshl_b32 s15, s28, 8 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_and_b32 s12, s12, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s12, s12, s14 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s23, 0xff -; VI-NEXT: s_lshl_b32 s9, s34, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s31, 0xff -; VI-NEXT: s_lshl_b32 s11, s30, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s12 +; VI-NEXT: s_and_b32 s12, s13, 0xff +; VI-NEXT: s_lshl_b32 s13, s34, 8 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_and_b32 s13, s31, 0xff +; VI-NEXT: s_lshl_b32 s14, s30, 8 +; VI-NEXT: s_or_b32 s13, s13, s14 +; VI-NEXT: s_and_b32 s12, s12, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s12, s12, s13 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s24, 0xff -; VI-NEXT: s_lshl_b32 s9, s91, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s90, 0xff -; VI-NEXT: s_lshl_b32 s11, s12, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s12 +; VI-NEXT: s_and_b32 s10, s10, 0xff +; VI-NEXT: s_lshl_b32 s12, s91, 8 +; VI-NEXT: s_or_b32 s10, s10, s12 +; VI-NEXT: s_and_b32 s12, s90, 0xff +; VI-NEXT: s_lshl_b32 s13, s26, 8 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_lshl_b32 s12, s12, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s10, s10, s12 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s25, 0xff -; VI-NEXT: s_lshl_b32 s9, s89, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s88, 0xff -; VI-NEXT: s_lshl_b32 s11, s79, 8 -; VI-NEXT: s_or_b32 s9, s9, s11 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: s_and_b32 s10, s11, 0xff +; VI-NEXT: s_lshl_b32 s11, s89, 8 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s11, s88, 0xff +; VI-NEXT: s_lshl_b32 s12, s79, 8 +; VI-NEXT: s_or_b32 s11, s11, s12 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s10, s10, s11 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s26, 0xff -; VI-NEXT: s_lshl_b32 s9, s78, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s77, 0xff -; VI-NEXT: s_lshl_b32 s10, s10, 8 -; VI-NEXT: s_or_b32 s9, s9, s10 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: s_and_b32 s8, s8, 0xff +; VI-NEXT: s_lshl_b32 s10, s78, 8 +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: s_and_b32 s10, s77, 0xff +; VI-NEXT: s_lshl_b32 s11, s24, 8 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 36, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s8, s8, s10 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s27, 0xff +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: s_and_b32 s8, s9, 0xff ; VI-NEXT: s_lshl_b32 s9, s76, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s8, s8, s9 ; VI-NEXT: s_and_b32 s9, s75, 0xff ; VI-NEXT: s_lshl_b32 s10, s74, 8 ; VI-NEXT: s_or_b32 s9, s9, s10 -; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s8, s8, 0xffff ; VI-NEXT: s_lshl_b32 s9, s9, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 40, v0 -; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_or_b32 s8, s8, s9 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s28, 0xff -; VI-NEXT: s_lshl_b32 s9, s73, 8 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_and_b32 s9, s72, 0xff -; VI-NEXT: s_lshl_b32 s8, s8, 8 -; VI-NEXT: s_or_b32 s8, s9, s8 -; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xff +; VI-NEXT: s_lshl_b32 s8, s73, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s8, s72, 0xff +; VI-NEXT: s_lshl_b32 s9, s22, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 44, v0 -; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_or_b32 s6, s6, s8 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_and_b32 s7, s29, 0xff -; VI-NEXT: s_lshl_b32 s8, s63, 8 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_and_b32 s6, s7, 0xff +; VI-NEXT: s_lshl_b32 s7, s63, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s62, 0xff +; VI-NEXT: s_lshl_b32 s8, s61, 8 ; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s8, s62, 0xff -; VI-NEXT: s_lshl_b32 s9, s61, 8 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v0 -; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_and_b32 s4, s4, 0xff -; VI-NEXT: s_lshl_b32 s7, s60, 8 -; VI-NEXT: s_or_b32 s4, s4, s7 -; VI-NEXT: s_and_b32 s7, s59, 0xff -; VI-NEXT: s_lshl_b32 s6, s6, 8 -; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_lshl_b32 s6, s60, 8 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_and_b32 s6, s59, 0xff +; VI-NEXT: s_lshl_b32 s7, s20, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 52, v0 @@ -66997,28 +67519,28 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: v_readlane_b32 s67, v4, 19 -; VI-NEXT: v_readlane_b32 s66, v4, 18 -; VI-NEXT: v_readlane_b32 s65, v4, 17 -; VI-NEXT: v_readlane_b32 s64, v4, 16 -; VI-NEXT: v_readlane_b32 s55, v4, 15 -; VI-NEXT: v_readlane_b32 s54, v4, 14 -; VI-NEXT: v_readlane_b32 s53, v4, 13 -; VI-NEXT: v_readlane_b32 s52, v4, 12 -; VI-NEXT: v_readlane_b32 s51, v4, 11 -; VI-NEXT: v_readlane_b32 s50, v4, 10 -; VI-NEXT: v_readlane_b32 s49, v4, 9 -; VI-NEXT: v_readlane_b32 s48, v4, 8 -; VI-NEXT: v_readlane_b32 s39, v4, 7 -; VI-NEXT: v_readlane_b32 s38, v4, 6 -; VI-NEXT: v_readlane_b32 s37, v4, 5 -; VI-NEXT: v_readlane_b32 s36, v4, 4 -; VI-NEXT: v_readlane_b32 s35, v4, 3 -; VI-NEXT: v_readlane_b32 s34, v4, 2 -; VI-NEXT: v_readlane_b32 s31, v4, 1 -; VI-NEXT: v_readlane_b32 s30, v4, 0 +; VI-NEXT: v_readlane_b32 s67, v18, 19 +; VI-NEXT: v_readlane_b32 s66, v18, 18 +; VI-NEXT: v_readlane_b32 s65, v18, 17 +; VI-NEXT: v_readlane_b32 s64, v18, 16 +; VI-NEXT: v_readlane_b32 s55, v18, 15 +; VI-NEXT: v_readlane_b32 s54, v18, 14 +; VI-NEXT: v_readlane_b32 s53, v18, 13 +; VI-NEXT: v_readlane_b32 s52, v18, 12 +; VI-NEXT: v_readlane_b32 s51, v18, 11 +; VI-NEXT: v_readlane_b32 s50, v18, 10 +; VI-NEXT: v_readlane_b32 s49, v18, 9 +; VI-NEXT: v_readlane_b32 s48, v18, 8 +; VI-NEXT: v_readlane_b32 s39, v18, 7 +; VI-NEXT: v_readlane_b32 s38, v18, 6 +; VI-NEXT: v_readlane_b32 s37, v18, 5 +; VI-NEXT: v_readlane_b32 s36, v18, 4 +; VI-NEXT: v_readlane_b32 s35, v18, 3 +; VI-NEXT: v_readlane_b32 s34, v18, 2 +; VI-NEXT: v_readlane_b32 s31, v18, 1 +; VI-NEXT: v_readlane_b32 s30, v18, 0 ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -67043,31 +67565,31 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; VI-NEXT: ; implicit-def: $sgpr37 ; VI-NEXT: ; implicit-def: $sgpr36 ; VI-NEXT: ; implicit-def: $sgpr35 -; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr28 ; VI-NEXT: ; implicit-def: $sgpr34 ; VI-NEXT: ; implicit-def: $sgpr31 ; VI-NEXT: ; implicit-def: $sgpr30 ; VI-NEXT: ; implicit-def: $sgpr91 ; VI-NEXT: ; implicit-def: $sgpr90 -; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; implicit-def: $sgpr89 ; VI-NEXT: ; implicit-def: $sgpr88 ; VI-NEXT: ; implicit-def: $sgpr79 ; VI-NEXT: ; implicit-def: $sgpr78 ; VI-NEXT: ; implicit-def: $sgpr77 -; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr24 ; VI-NEXT: ; implicit-def: $sgpr76 ; VI-NEXT: ; implicit-def: $sgpr75 ; VI-NEXT: ; implicit-def: $sgpr74 ; VI-NEXT: ; implicit-def: $sgpr73 ; VI-NEXT: ; implicit-def: $sgpr72 -; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr22 ; VI-NEXT: ; implicit-def: $sgpr63 ; VI-NEXT: ; implicit-def: $sgpr62 ; VI-NEXT: ; implicit-def: $sgpr61 ; VI-NEXT: ; implicit-def: $sgpr60 ; VI-NEXT: ; implicit-def: $sgpr59 -; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr20 ; VI-NEXT: ; implicit-def: $sgpr58 ; VI-NEXT: ; implicit-def: $sgpr57 ; VI-NEXT: ; implicit-def: $sgpr56 @@ -67094,10 +67616,38 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: v_writelane_b32 v63, s52, 12 ; GFX9-NEXT: v_writelane_b32 v63, s53, 13 ; GFX9-NEXT: v_writelane_b32 v63, s54, 14 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: v_mov_b32_e32 v5, s17 +; GFX9-NEXT: v_mov_b32_e32 v6, s18 +; GFX9-NEXT: v_mov_b32_e32 v7, s19 +; GFX9-NEXT: v_mov_b32_e32 v8, s20 +; GFX9-NEXT: v_mov_b32_e32 v9, s21 +; GFX9-NEXT: v_mov_b32_e32 v10, s22 +; GFX9-NEXT: v_mov_b32_e32 v11, s23 +; GFX9-NEXT: v_mov_b32_e32 v12, s24 +; GFX9-NEXT: v_mov_b32_e32 v13, s25 +; GFX9-NEXT: v_mov_b32_e32 v14, s26 +; GFX9-NEXT: v_mov_b32_e32 v15, s27 +; GFX9-NEXT: v_mov_b32_e32 v16, s28 +; GFX9-NEXT: v_mov_b32_e32 v17, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; GFX9-NEXT: v_writelane_b32 v63, s55, 15 +; GFX9-NEXT: v_readfirstlane_b32 s18, v4 +; GFX9-NEXT: v_readfirstlane_b32 s19, v5 +; GFX9-NEXT: v_readfirstlane_b32 s16, v6 +; GFX9-NEXT: v_readfirstlane_b32 s17, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s12, v10 +; GFX9-NEXT: v_readfirstlane_b32 s13, v11 +; GFX9-NEXT: v_readfirstlane_b32 s10, v12 +; GFX9-NEXT: v_readfirstlane_b32 s11, v13 +; GFX9-NEXT: v_readfirstlane_b32 s8, v14 +; GFX9-NEXT: v_readfirstlane_b32 s9, v15 +; GFX9-NEXT: v_readfirstlane_b32 s6, v16 +; GFX9-NEXT: v_readfirstlane_b32 s7, v17 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec ; GFX9-NEXT: v_readfirstlane_b32 s5, v2 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -67121,76 +67671,76 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: s_lshr_b32 s59, s5, 8 ; GFX9-NEXT: s_lshr_b32 s58, s4, 16 ; GFX9-NEXT: s_lshr_b32 s60, s4, 8 -; GFX9-NEXT: s_lshr_b32 s61, s29, 24 -; GFX9-NEXT: s_lshr_b32 s62, s29, 16 -; GFX9-NEXT: s_lshr_b32 s72, s29, 8 -; GFX9-NEXT: s_lshr_b32 s63, s28, 16 -; GFX9-NEXT: s_lshr_b32 s73, s28, 8 -; GFX9-NEXT: s_lshr_b32 s74, s27, 24 -; GFX9-NEXT: s_lshr_b32 s75, s27, 16 -; GFX9-NEXT: s_lshr_b32 s77, s27, 8 -; GFX9-NEXT: s_lshr_b32 s76, s26, 16 -; GFX9-NEXT: s_lshr_b32 s78, s26, 8 -; GFX9-NEXT: s_lshr_b32 s79, s25, 24 -; GFX9-NEXT: s_lshr_b32 s88, s25, 16 -; GFX9-NEXT: s_lshr_b32 s90, s25, 8 -; GFX9-NEXT: s_lshr_b32 s89, s24, 16 -; GFX9-NEXT: s_lshr_b32 s91, s24, 8 -; GFX9-NEXT: s_lshr_b32 s92, s23, 24 -; GFX9-NEXT: s_lshr_b32 s93, s23, 16 -; GFX9-NEXT: s_lshr_b32 s95, s23, 8 -; GFX9-NEXT: s_lshr_b32 s94, s22, 16 -; GFX9-NEXT: s_lshr_b32 s30, s22, 8 -; GFX9-NEXT: s_lshr_b32 s31, s21, 24 -; GFX9-NEXT: s_lshr_b32 s34, s21, 16 -; GFX9-NEXT: s_lshr_b32 s36, s21, 8 -; GFX9-NEXT: s_lshr_b32 s35, s20, 16 -; GFX9-NEXT: s_lshr_b32 s37, s20, 8 -; GFX9-NEXT: s_lshr_b32 s38, s19, 24 -; GFX9-NEXT: s_lshr_b32 s39, s19, 16 -; GFX9-NEXT: s_lshr_b32 s49, s19, 8 -; GFX9-NEXT: s_lshr_b32 s48, s18, 16 -; GFX9-NEXT: s_lshr_b32 s50, s18, 8 -; GFX9-NEXT: s_lshr_b32 s51, s17, 24 -; GFX9-NEXT: s_lshr_b32 s52, s17, 16 -; GFX9-NEXT: s_lshr_b32 s54, s17, 8 -; GFX9-NEXT: s_lshr_b32 s53, s16, 16 -; GFX9-NEXT: s_lshr_b32 s55, s16, 8 +; GFX9-NEXT: s_lshr_b32 s61, s7, 24 +; GFX9-NEXT: s_lshr_b32 s62, s7, 16 +; GFX9-NEXT: s_lshr_b32 s72, s7, 8 +; GFX9-NEXT: s_lshr_b32 s63, s6, 16 +; GFX9-NEXT: s_lshr_b32 s73, s6, 8 +; GFX9-NEXT: s_lshr_b32 s74, s9, 24 +; GFX9-NEXT: s_lshr_b32 s75, s9, 16 +; GFX9-NEXT: s_lshr_b32 s77, s9, 8 +; GFX9-NEXT: s_lshr_b32 s76, s8, 16 +; GFX9-NEXT: s_lshr_b32 s78, s8, 8 +; GFX9-NEXT: s_lshr_b32 s79, s11, 24 +; GFX9-NEXT: s_lshr_b32 s88, s11, 16 +; GFX9-NEXT: s_lshr_b32 s90, s11, 8 +; GFX9-NEXT: s_lshr_b32 s89, s10, 16 +; GFX9-NEXT: s_lshr_b32 s91, s10, 8 +; GFX9-NEXT: s_lshr_b32 s92, s13, 24 +; GFX9-NEXT: s_lshr_b32 s93, s13, 16 +; GFX9-NEXT: s_lshr_b32 s95, s13, 8 +; GFX9-NEXT: s_lshr_b32 s94, s12, 16 +; GFX9-NEXT: s_lshr_b32 s30, s12, 8 +; GFX9-NEXT: s_lshr_b32 s31, s15, 24 +; GFX9-NEXT: s_lshr_b32 s34, s15, 16 +; GFX9-NEXT: s_lshr_b32 s36, s15, 8 +; GFX9-NEXT: s_lshr_b32 s35, s14, 16 +; GFX9-NEXT: s_lshr_b32 s37, s14, 8 +; GFX9-NEXT: s_lshr_b32 s38, s17, 24 +; GFX9-NEXT: s_lshr_b32 s39, s17, 16 +; GFX9-NEXT: s_lshr_b32 s49, s17, 8 +; GFX9-NEXT: s_lshr_b32 s48, s16, 16 +; GFX9-NEXT: s_lshr_b32 s50, s16, 8 +; GFX9-NEXT: s_lshr_b32 s51, s19, 24 +; GFX9-NEXT: s_lshr_b32 s52, s19, 16 +; GFX9-NEXT: s_lshr_b32 s54, s19, 8 +; GFX9-NEXT: s_lshr_b32 s53, s18, 16 +; GFX9-NEXT: s_lshr_b32 s55, s18, 8 ; GFX9-NEXT: s_lshr_b64 s[44:45], s[4:5], 24 -; GFX9-NEXT: s_lshr_b64 s[42:43], s[28:29], 24 -; GFX9-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 -; GFX9-NEXT: s_lshr_b64 s[14:15], s[24:25], 24 -; GFX9-NEXT: s_lshr_b64 s[12:13], s[22:23], 24 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 -; GFX9-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 -; GFX9-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[42:43], s[6:7], 24 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[8:9], 24 +; GFX9-NEXT: s_lshr_b64 s[28:29], s[10:11], 24 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[12:13], 24 +; GFX9-NEXT: s_lshr_b64 s[24:25], s[14:15], 24 +; GFX9-NEXT: s_lshr_b64 s[22:23], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[20:21], s[18:19], 24 ; GFX9-NEXT: s_cbranch_execnz .LBB97_4 ; GFX9-NEXT: .LBB97_2: ; %cmp.true -; GFX9-NEXT: v_pk_add_u16 v6, s27, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v5, s26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s8, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v2, s5, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, s4, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v8, s25, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v7, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[1:2] ; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[5:6] -; GFX9-NEXT: v_pk_add_u16 v10, s23, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v9, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s12, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[7:8] -; GFX9-NEXT: v_pk_add_u16 v12, s21, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v11, s20, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v4, s29, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v3, s28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[9:10] -; GFX9-NEXT: v_pk_add_u16 v16, s19, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v15, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[3:4] ; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[11:12] -; GFX9-NEXT: v_pk_add_u16 v20, s17, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v19, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, s18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[15:16] ; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 @@ -67240,31 +67790,31 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: .LBB97_3: ; GFX9-NEXT: ; implicit-def: $sgpr55 ; GFX9-NEXT: ; implicit-def: $sgpr53 -; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr20 ; GFX9-NEXT: ; implicit-def: $sgpr54 ; GFX9-NEXT: ; implicit-def: $sgpr52 ; GFX9-NEXT: ; implicit-def: $sgpr51 ; GFX9-NEXT: ; implicit-def: $sgpr50 ; GFX9-NEXT: ; implicit-def: $sgpr48 -; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr22 ; GFX9-NEXT: ; implicit-def: $sgpr49 ; GFX9-NEXT: ; implicit-def: $sgpr39 ; GFX9-NEXT: ; implicit-def: $sgpr38 ; GFX9-NEXT: ; implicit-def: $sgpr37 ; GFX9-NEXT: ; implicit-def: $sgpr35 -; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr24 ; GFX9-NEXT: ; implicit-def: $sgpr36 ; GFX9-NEXT: ; implicit-def: $sgpr34 ; GFX9-NEXT: ; implicit-def: $sgpr31 ; GFX9-NEXT: ; implicit-def: $sgpr30 ; GFX9-NEXT: ; implicit-def: $sgpr94 -; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; implicit-def: $sgpr95 ; GFX9-NEXT: ; implicit-def: $sgpr93 ; GFX9-NEXT: ; implicit-def: $sgpr92 ; GFX9-NEXT: ; implicit-def: $sgpr91 ; GFX9-NEXT: ; implicit-def: $sgpr89 -; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr28 ; GFX9-NEXT: ; implicit-def: $sgpr90 ; GFX9-NEXT: ; implicit-def: $sgpr88 ; GFX9-NEXT: ; implicit-def: $sgpr79 @@ -67293,20 +67843,20 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v21, s42 -; GFX9-NEXT: v_mov_b32_e32 v19, s16 -; GFX9-NEXT: v_mov_b32_e32 v20, s17 -; GFX9-NEXT: v_mov_b32_e32 v15, s18 -; GFX9-NEXT: v_mov_b32_e32 v16, s19 -; GFX9-NEXT: v_mov_b32_e32 v11, s20 -; GFX9-NEXT: v_mov_b32_e32 v12, s21 -; GFX9-NEXT: v_mov_b32_e32 v9, s22 -; GFX9-NEXT: v_mov_b32_e32 v10, s23 -; GFX9-NEXT: v_mov_b32_e32 v7, s24 -; GFX9-NEXT: v_mov_b32_e32 v8, s25 -; GFX9-NEXT: v_mov_b32_e32 v5, s26 -; GFX9-NEXT: v_mov_b32_e32 v6, s27 -; GFX9-NEXT: v_mov_b32_e32 v3, s28 -; GFX9-NEXT: v_mov_b32_e32 v4, s29 +; GFX9-NEXT: v_mov_b32_e32 v19, s18 +; GFX9-NEXT: v_mov_b32_e32 v20, s19 +; GFX9-NEXT: v_mov_b32_e32 v15, s16 +; GFX9-NEXT: v_mov_b32_e32 v16, s17 +; GFX9-NEXT: v_mov_b32_e32 v11, s14 +; GFX9-NEXT: v_mov_b32_e32 v12, s15 +; GFX9-NEXT: v_mov_b32_e32 v9, s12 +; GFX9-NEXT: v_mov_b32_e32 v10, s13 +; GFX9-NEXT: v_mov_b32_e32 v7, s10 +; GFX9-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-NEXT: v_mov_b32_e32 v6, s9 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mov_b32_e32 v17, s55 @@ -67349,15 +67899,15 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: v_mov_b32_e32 v27, s59 ; GFX9-NEXT: v_mov_b32_e32 v14, s57 ; GFX9-NEXT: v_mov_b32_e32 v18, s56 -; GFX9-NEXT: v_mov_b32_e32 v23, s12 -; GFX9-NEXT: v_mov_b32_e32 v24, s10 -; GFX9-NEXT: v_mov_b32_e32 v25, s8 -; GFX9-NEXT: v_mov_b32_e32 v26, s6 +; GFX9-NEXT: v_mov_b32_e32 v23, s26 +; GFX9-NEXT: v_mov_b32_e32 v24, s24 +; GFX9-NEXT: v_mov_b32_e32 v25, s22 +; GFX9-NEXT: v_mov_b32_e32 v26, s20 ; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v21, s40 -; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: v_mov_b32_e32 v22, s28 ; GFX9-NEXT: .LBB97_5: ; %end ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; GFX9-NEXT: v_or_b32_sdwa v17, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -75069,670 +75619,340 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg % ; VI-LABEL: bitcast_v32bf16_to_v32f16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v20, s30, 0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_writelane_b32 v20, s31, 1 -; VI-NEXT: v_readfirstlane_b32 s30, v0 +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v22, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v21, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v20, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v19, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v18, s25 +; VI-NEXT: v_mov_b32_e32 v17, s27 +; VI-NEXT: v_mov_b32_e32 v16, s29 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_readfirstlane_b32 s31, v1 -; VI-NEXT: s_cbranch_scc0 .LBB103_3 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_cbranch_scc0 .LBB103_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB103_4 +; VI-NEXT: s_cbranch_execnz .LBB103_3 ; VI-NEXT: .LBB103_2: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v16, 0x40c00000 -; VI-NEXT: s_lshl_b32 s4, s26, 16 -; VI-NEXT: v_add_f32_e32 v4, s4, v16 -; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v1 ; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; VI-NEXT: v_add_f32_e32 v1, s4, v16 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_lshl_b32 s4, s30, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v16 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: s_and_b32 s6, s26, 0xffff0000 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[2:3] -; VI-NEXT: v_add_f32_e32 v2, s6, v16 -; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[0:1] -; VI-NEXT: v_bfe_u32 v0, v4, 16, 1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v4 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v1, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: s_lshl_b32 s6, s24, 16 -; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; VI-NEXT: v_add_f32_e32 v2, s6, v16 -; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[0:1] -; VI-NEXT: v_bfe_u32 v0, v2, 16, 1 -; VI-NEXT: s_and_b32 s6, s24, 0xffff0000 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; VI-NEXT: v_or_b32_e32 v1, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; VI-NEXT: v_add_f32_e32 v2, s6, v16 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: s_lshl_b32 s6, s22, 16 -; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; VI-NEXT: v_add_f32_e32 v2, s6, v16 -; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[0:1] -; VI-NEXT: v_bfe_u32 v0, v2, 16, 1 -; VI-NEXT: s_and_b32 s6, s22, 0xffff0000 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; VI-NEXT: v_or_b32_e32 v1, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; VI-NEXT: v_add_f32_e32 v2, s6, v16 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: s_lshl_b32 s6, s20, 16 -; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; VI-NEXT: v_add_f32_e32 v2, s6, v16 -; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[0:1] -; VI-NEXT: v_bfe_u32 v0, v2, 16, 1 -; VI-NEXT: s_and_b32 s6, s20, 0xffff0000 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; VI-NEXT: v_or_b32_e32 v1, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; VI-NEXT: v_add_f32_e32 v2, s6, v16 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v22, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: s_lshl_b32 s6, s18, 16 -; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; VI-NEXT: v_add_f32_e32 v2, s6, v16 -; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] -; VI-NEXT: v_bfe_u32 v0, v2, 16, 1 -; VI-NEXT: s_and_b32 s6, s18, 0xffff0000 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; VI-NEXT: v_or_b32_e32 v1, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; VI-NEXT: v_add_f32_e32 v2, s6, v16 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v3 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc -; VI-NEXT: s_and_b32 s7, s16, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; VI-NEXT: v_add_f32_e32 v5, s7, v16 -; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] -; VI-NEXT: v_bfe_u32 v0, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v5 -; VI-NEXT: s_lshl_b32 s6, s16, 16 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 -; VI-NEXT: v_or_b32_e32 v1, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_add_f32_e32 v3, s6, v16 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc -; VI-NEXT: v_bfe_u32 v0, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v3 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_and_b32 s5, s17, 0xffff0000 -; VI-NEXT: v_add_f32_e32 v3, s5, v16 -; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] -; VI-NEXT: v_bfe_u32 v1, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v3, s4, v16 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: s_and_b32 s5, s19, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v3, s5, v16 -; VI-NEXT: v_cndmask_b32_e32 v17, v5, v7, vcc -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v3, s4, v16 -; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: s_and_b32 s5, s21, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; VI-NEXT: v_add_f32_e32 v5, s5, v16 -; VI-NEXT: v_mov_b32_e32 v1, v17 -; VI-NEXT: v_cndmask_b32_e32 v17, v7, v9, vcc +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v21 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 ; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 -; VI-NEXT: s_lshl_b32 s4, s21, 16 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_add_f32_e32 v5, s4, v16 -; VI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; VI-NEXT: v_bfe_u32 v9, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: s_and_b32 s5, s23, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 -; VI-NEXT: v_add_f32_e32 v7, s5, v16 -; VI-NEXT: v_mov_b32_e32 v3, v17 -; VI-NEXT: v_cndmask_b32_e32 v17, v9, v11, vcc +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v4, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v24, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v4 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v20 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 ; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 -; VI-NEXT: s_lshl_b32 s4, s23, 16 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_add_f32_e32 v7, s4, v16 -; VI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc -; VI-NEXT: v_bfe_u32 v11, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v7 -; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 -; VI-NEXT: s_and_b32 s5, s25, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 -; VI-NEXT: v_add_f32_e32 v9, s5, v16 -; VI-NEXT: v_mov_b32_e32 v5, v17 -; VI-NEXT: v_cndmask_b32_e32 v17, v11, v13, vcc +; VI-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v20 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 ; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 -; VI-NEXT: s_lshl_b32 s4, s25, 16 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: v_add_f32_e32 v9, s4, v16 -; VI-NEXT: v_cndmask_b32_e32 v11, v11, v13, vcc -; VI-NEXT: v_bfe_u32 v13, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v9 -; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: s_and_b32 s5, s27, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v9 +; VI-NEXT: v_bfe_u32 v9, v6, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v20, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v19 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 -; VI-NEXT: v_add_f32_e32 v11, s5, v16 -; VI-NEXT: v_mov_b32_e32 v7, v17 -; VI-NEXT: v_cndmask_b32_e32 v17, v13, v15, vcc +; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 ; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 -; VI-NEXT: s_lshl_b32 s4, s27, 16 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v11 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: v_add_f32_e32 v11, s4, v16 -; VI-NEXT: v_cndmask_b32_e32 v13, v13, v15, vcc -; VI-NEXT: v_bfe_u32 v15, v11, 16, 1 -; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] -; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v11 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 -; VI-NEXT: s_and_b32 s5, s29, 0xffff0000 -; VI-NEXT: v_mov_b32_e32 v9, v17 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v11 +; VI-NEXT: v_bfe_u32 v11, v8, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v26, v13, v19, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v8 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v11, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v11 +; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v18 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v11 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 -; VI-NEXT: v_add_f32_e32 v13, s5, v16 -; VI-NEXT: v_cndmask_b32_e32 v17, v15, v17, vcc -; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 -; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] -; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 -; VI-NEXT: v_mov_b32_e32 v11, v17 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v13 +; VI-NEXT: v_cndmask_b32_e32 v11, v13, v19, vcc +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v18 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v18, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v13 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_add_f32_e32 v13, s4, v16 -; VI-NEXT: v_cndmask_b32_e32 v15, v15, v17, vcc -; VI-NEXT: v_bfe_u32 v17, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v13 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; VI-NEXT: v_bfe_u32 v13, v10, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v10 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v28, 0x400000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v13, v13, v28, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v28, 0x400000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_cndmask_b32_e32 v10, v11, v28, vcc +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v13 +; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v17 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v28, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v28, vcc, v28, v13 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x7fff, v28 +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_add_f32_e32 v13, s4, v16 -; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 -; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; VI-NEXT: v_cndmask_b32_e32 v13, v28, v29, vcc +; VI-NEXT: v_bfe_u32 v28, v17, 16, 1 +; VI-NEXT: v_add_u32_e32 v28, vcc, v28, v17 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x7fff, v28 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_cndmask_b32_e32 v28, v28, v29, vcc +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; VI-NEXT: v_bfe_u32 v17, v12, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v12 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v30, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v30, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v30, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v30, vcc +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v17 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; VI-NEXT: v_bfe_u32 v30, v17, 16, 1 +; VI-NEXT: v_add_u32_e32 v30, vcc, v30, v17 +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x7fff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_or_b32_e32 v31, 0x400000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_cndmask_b32_e32 v17, v30, v31, vcc +; VI-NEXT: v_bfe_u32 v30, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v30, vcc, v30, v16 +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x7fff, v30 +; VI-NEXT: v_or_b32_e32 v31, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v30, v31, vcc +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v31, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v14 +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x7fff, v31 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; VI-NEXT: v_cndmask_b32_e32 v14, v31, v32, vcc +; VI-NEXT: v_bfe_u32 v31, v30, 16, 1 +; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v30 +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x7fff, v31 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v30 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; VI-NEXT: v_cndmask_b32_e32 v30, v31, v32, vcc +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v15 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v32, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v14 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x7fff, v32 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[22:23] +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[24:25] +; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[20:21] +; VI-NEXT: v_cndmask_b32_e32 v32, v32, v33, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 +; VI-NEXT: v_mov_b32_e32 v21, v23 +; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[26:27] ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: s_lshl_b32 s4, s31, 16 -; VI-NEXT: v_cndmask_b32_e32 v13, v15, v19, vcc -; VI-NEXT: v_add_f32_e32 v15, s4, v16 -; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: v_cndmask_b32_e32 v15, v16, v19, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 -; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] -; VI-NEXT: v_lshrrev_b64 v[15:16], 16, v[15:16] -; VI-NEXT: v_mov_b32_e32 v13, v17 -; VI-NEXT: s_branch .LBB103_5 -; VI-NEXT: .LBB103_3: -; VI-NEXT: s_branch .LBB103_2 -; VI-NEXT: .LBB103_4: -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 -; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 -; VI-NEXT: v_mov_b32_e32 v13, s29 -; VI-NEXT: v_mov_b32_e32 v14, s30 -; VI-NEXT: v_mov_b32_e32 v15, s31 -; VI-NEXT: .LBB103_5: ; %end -; VI-NEXT: v_readlane_b32 s31, v20, 1 -; VI-NEXT: v_readlane_b32 s30, v20, 0 -; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_mov_b32_e32 v19, v23 +; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[28:29] +; VI-NEXT: v_cndmask_b32_e32 v14, v15, v33, vcc +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v32 +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[16:17] +; VI-NEXT: v_mov_b32_e32 v17, v23 +; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[14:15] +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[30:31] +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[12:13] +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[10:11] +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[8:9] +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[6:7] +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[4:5] +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v15, v23 +; VI-NEXT: .LBB103_3: ; %end +; VI-NEXT: v_mov_b32_e32 v1, v22 +; VI-NEXT: v_mov_b32_e32 v3, v21 +; VI-NEXT: v_mov_b32_e32 v5, v20 +; VI-NEXT: v_mov_b32_e32 v7, v19 +; VI-NEXT: v_mov_b32_e32 v9, v18 +; VI-NEXT: v_mov_b32_e32 v11, v17 +; VI-NEXT: v_mov_b32_e32 v13, v16 ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB103_4: +; VI-NEXT: s_branch .LBB103_2 ; ; GFX9-LABEL: bitcast_v32bf16_to_v32f16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v20, s30, 0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: v_writelane_b32 v20, s31, 1 -; GFX9-NEXT: v_readfirstlane_b32 s30, v0 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_readfirstlane_b32 s31, v1 -; GFX9-NEXT: s_cbranch_scc0 .LBB103_3 -; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB103_4 -; GFX9-NEXT: .LBB103_2: ; %cmp.true -; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; GFX9-NEXT: s_and_b32 s5, s30, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v1, s5, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s5, s30, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s5, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b32 s5, s31, 0xffff0000 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add_f32_e32 v3, s5, v0 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: s_lshl_b32 s5, s31, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_add_f32_e32 v4, s5, v0 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: v_mov_b32_e32 v16, 0xffff -; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_and_b32_sdwa v4, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_lshl_or_b32 v15, v3, 16, v4 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s29, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_and_b32 s4, s28, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s28, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s27, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_and_b32 s4, s26, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s26, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s25, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_and_b32 s4, s24, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s24, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s23, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s22, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s21, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s20, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s19, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v17, vcc -; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s18, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc -; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 -; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v17, v17, v2 -; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc -; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 -; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v17, v17, v1 -; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_lshl_b32 s4, s17, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc -; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 -; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v17, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v17 -; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 -; GFX9-NEXT: s_lshl_b32 s4, s16, 16 -; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: v_bfe_u32 v18, v0, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v18, v18, v0 -; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: v_and_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v16 -; GFX9-NEXT: s_branch .LBB103_5 -; GFX9-NEXT: .LBB103_3: -; GFX9-NEXT: s_branch .LBB103_2 -; GFX9-NEXT: .LBB103_4: +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 @@ -75746,17 +75966,321 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg % ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v13, s29 -; GFX9-NEXT: v_mov_b32_e32 v14, s30 -; GFX9-NEXT: v_mov_b32_e32 v15, s31 -; GFX9-NEXT: .LBB103_5: ; %end -; GFX9-NEXT: v_readlane_b32 s31, v20, 1 -; GFX9-NEXT: v_readlane_b32 s30, v20, 0 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_cbranch_scc0 .LBB103_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB103_3 +; GFX9-NEXT: .LBB103_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v0 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v16 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v0 +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v1 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v2 +; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX9-NEXT: v_bfe_u32 v19, v18, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v19, v19, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_add_u32_e32 v19, 0x7fff, v19 +; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc +; GFX9-NEXT: v_bfe_u32 v19, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v19, v19, v2 +; GFX9-NEXT: v_add_u32_e32 v19, 0x7fff, v19 +; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v19, v20, vcc +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v3 +; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GFX9-NEXT: v_bfe_u32 v20, v19, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v20, v20, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_add_u32_e32 v20, 0x7fff, v20 +; GFX9-NEXT: v_or_b32_e32 v21, 0x400000, v19 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc +; GFX9-NEXT: v_bfe_u32 v20, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v20, v20, v3 +; GFX9-NEXT: v_add_u32_e32 v20, 0x7fff, v20 +; GFX9-NEXT: v_or_b32_e32 v21, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v20, v21, vcc +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v4 +; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX9-NEXT: v_bfe_u32 v21, v20, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v21, v21, v20 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_add_u32_e32 v21, 0x7fff, v21 +; GFX9-NEXT: v_or_b32_e32 v22, 0x400000, v20 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v21, v22, vcc +; GFX9-NEXT: v_bfe_u32 v21, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v21, v21, v4 +; GFX9-NEXT: v_add_u32_e32 v21, 0x7fff, v21 +; GFX9-NEXT: v_or_b32_e32 v22, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v21, v22, vcc +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v5 +; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GFX9-NEXT: v_bfe_u32 v22, v21, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v22, v22, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_add_u32_e32 v22, 0x7fff, v22 +; GFX9-NEXT: v_or_b32_e32 v23, 0x400000, v21 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc +; GFX9-NEXT: v_bfe_u32 v22, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v22, v22, v5 +; GFX9-NEXT: v_add_u32_e32 v22, 0x7fff, v22 +; GFX9-NEXT: v_or_b32_e32 v23, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v22, v23, vcc +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v6 +; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX9-NEXT: v_bfe_u32 v23, v22, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v23, v23, v22 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_add_u32_e32 v23, 0x7fff, v23 +; GFX9-NEXT: v_or_b32_e32 v24, 0x400000, v22 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v23, v24, vcc +; GFX9-NEXT: v_bfe_u32 v23, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v23, v23, v6 +; GFX9-NEXT: v_add_u32_e32 v23, 0x7fff, v23 +; GFX9-NEXT: v_or_b32_e32 v24, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v23, v24, vcc +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v7 +; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX9-NEXT: v_bfe_u32 v24, v23, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v24, v24, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_add_u32_e32 v24, 0x7fff, v24 +; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v23 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc +; GFX9-NEXT: v_bfe_u32 v24, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v24, v24, v7 +; GFX9-NEXT: v_add_u32_e32 v24, 0x7fff, v24 +; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v24, v25, vcc +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v8 +; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX9-NEXT: v_bfe_u32 v25, v24, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v25, v25, v24 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX9-NEXT: v_add_u32_e32 v25, 0x7fff, v25 +; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v24 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v25, v26, vcc +; GFX9-NEXT: v_bfe_u32 v25, v8, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v25, v25, v8 +; GFX9-NEXT: v_add_u32_e32 v25, 0x7fff, v25 +; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v25, v26, vcc +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v9 +; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX9-NEXT: v_bfe_u32 v26, v25, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v26, v26, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX9-NEXT: v_add_u32_e32 v26, 0x7fff, v26 +; GFX9-NEXT: v_or_b32_e32 v27, 0x400000, v25 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v26, v27, vcc +; GFX9-NEXT: v_bfe_u32 v26, v9, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v26, v26, v9 +; GFX9-NEXT: v_add_u32_e32 v26, 0x7fff, v26 +; GFX9-NEXT: v_or_b32_e32 v27, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v26, v27, vcc +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v10 +; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX9-NEXT: v_bfe_u32 v27, v26, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v27, v27, v26 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX9-NEXT: v_add_u32_e32 v27, 0x7fff, v27 +; GFX9-NEXT: v_or_b32_e32 v28, 0x400000, v26 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v27, v28, vcc +; GFX9-NEXT: v_bfe_u32 v27, v10, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v27, v27, v10 +; GFX9-NEXT: v_add_u32_e32 v27, 0x7fff, v27 +; GFX9-NEXT: v_or_b32_e32 v28, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v27, v28, vcc +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v11 +; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX9-NEXT: v_bfe_u32 v28, v27, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v28, v28, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX9-NEXT: v_add_u32_e32 v28, 0x7fff, v28 +; GFX9-NEXT: v_or_b32_e32 v29, 0x400000, v27 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v28, v29, vcc +; GFX9-NEXT: v_bfe_u32 v28, v11, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v28, v28, v11 +; GFX9-NEXT: v_add_u32_e32 v28, 0x7fff, v28 +; GFX9-NEXT: v_or_b32_e32 v29, 0x400000, v11 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v28, v29, vcc +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v12 +; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX9-NEXT: v_bfe_u32 v29, v28, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v29, v29, v28 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX9-NEXT: v_add_u32_e32 v29, 0x7fff, v29 +; GFX9-NEXT: v_or_b32_e32 v30, 0x400000, v28 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v29, v30, vcc +; GFX9-NEXT: v_bfe_u32 v29, v12, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v29, v29, v12 +; GFX9-NEXT: v_add_u32_e32 v29, 0x7fff, v29 +; GFX9-NEXT: v_or_b32_e32 v30, 0x400000, v12 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v29, v30, vcc +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v13 +; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX9-NEXT: v_bfe_u32 v30, v29, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v30, v30, v29 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX9-NEXT: v_add_u32_e32 v30, 0x7fff, v30 +; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v29 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v30, v31, vcc +; GFX9-NEXT: v_bfe_u32 v30, v13, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v30, v30, v13 +; GFX9-NEXT: v_add_u32_e32 v30, 0x7fff, v30 +; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v30, v31, vcc +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v14 +; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX9-NEXT: v_bfe_u32 v31, v30, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v31, v31, v30 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX9-NEXT: v_add_u32_e32 v31, 0x7fff, v31 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v30 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v31, v32, vcc +; GFX9-NEXT: v_bfe_u32 v31, v14, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v31, v31, v14 +; GFX9-NEXT: v_add_u32_e32 v31, 0x7fff, v31 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v31, v32, vcc +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 +; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v32, v32, v31 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX9-NEXT: v_add_u32_e32 v32, 0x7fff, v32 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc +; GFX9-NEXT: v_bfe_u32 v32, v15, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v32, v32, v15 +; GFX9-NEXT: v_add_u32_e32 v32, 0x7fff, v32 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v32, v33, vcc +; GFX9-NEXT: v_mov_b32_e32 v32, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX9-NEXT: v_and_b32_sdwa v15, v32, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GFX9-NEXT: v_and_b32_sdwa v14, v32, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX9-NEXT: v_and_b32_sdwa v13, v32, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GFX9-NEXT: v_and_b32_sdwa v12, v32, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GFX9-NEXT: v_and_b32_sdwa v11, v32, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GFX9-NEXT: v_and_b32_sdwa v10, v32, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GFX9-NEXT: v_and_b32_sdwa v9, v32, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GFX9-NEXT: v_and_b32_sdwa v8, v32, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GFX9-NEXT: v_and_b32_sdwa v7, v32, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GFX9-NEXT: v_and_b32_sdwa v6, v32, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GFX9-NEXT: v_and_b32_sdwa v5, v32, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GFX9-NEXT: v_and_b32_sdwa v4, v32, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX9-NEXT: v_and_b32_sdwa v3, v32, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX9-NEXT: v_and_b32_sdwa v2, v32, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v15, v31, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v14, v30, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v13, v29, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v12, v28, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v11, v27, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v10, v26, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v9, v25, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v8, v24, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v7, v23, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v6, v22, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v5, v21, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v4, v20, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v3, v19, 16, v3 +; GFX9-NEXT: v_lshl_or_b32 v2, v18, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v17, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v16, 16, v0 +; GFX9-NEXT: .LBB103_3: ; %end ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB103_4: +; GFX9-NEXT: s_branch .LBB103_2 ; ; GFX11-TRUE16-LABEL: bitcast_v32bf16_to_v32f16_scalar: ; GFX11-TRUE16: ; %bb.0: @@ -79087,7 +79611,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: v_writelane_b32 v63, s30, 0 ; VI-NEXT: v_writelane_b32 v63, s31, 1 @@ -79108,10 +79632,38 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; VI-NEXT: v_writelane_b32 v63, s64, 16 ; VI-NEXT: v_writelane_b32 v63, s65, 17 ; VI-NEXT: v_writelane_b32 v63, s66, 18 +; VI-NEXT: v_mov_b32_e32 v4, s16 +; VI-NEXT: v_mov_b32_e32 v5, s17 +; VI-NEXT: v_mov_b32_e32 v6, s18 +; VI-NEXT: v_mov_b32_e32 v7, s19 +; VI-NEXT: v_mov_b32_e32 v8, s20 +; VI-NEXT: v_mov_b32_e32 v9, s21 +; VI-NEXT: v_mov_b32_e32 v10, s22 +; VI-NEXT: v_mov_b32_e32 v11, s23 +; VI-NEXT: v_mov_b32_e32 v12, s24 +; VI-NEXT: v_mov_b32_e32 v13, s25 +; VI-NEXT: v_mov_b32_e32 v14, s26 +; VI-NEXT: v_mov_b32_e32 v15, s27 +; VI-NEXT: v_mov_b32_e32 v16, s28 +; VI-NEXT: v_mov_b32_e32 v17, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; VI-NEXT: v_writelane_b32 v63, s67, 19 +; VI-NEXT: v_readfirstlane_b32 s18, v4 +; VI-NEXT: v_readfirstlane_b32 s19, v5 +; VI-NEXT: v_readfirstlane_b32 s16, v6 +; VI-NEXT: v_readfirstlane_b32 s17, v7 +; VI-NEXT: v_readfirstlane_b32 s14, v8 +; VI-NEXT: v_readfirstlane_b32 s15, v9 +; VI-NEXT: v_readfirstlane_b32 s12, v10 +; VI-NEXT: v_readfirstlane_b32 s13, v11 +; VI-NEXT: v_readfirstlane_b32 s10, v12 +; VI-NEXT: v_readfirstlane_b32 s11, v13 +; VI-NEXT: v_readfirstlane_b32 s8, v14 +; VI-NEXT: v_readfirstlane_b32 s9, v15 +; VI-NEXT: v_readfirstlane_b32 s6, v16 +; VI-NEXT: v_readfirstlane_b32 s7, v17 ; VI-NEXT: v_readfirstlane_b32 s4, v1 -; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_and_b64 s[20:21], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s5, v2 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -79130,232 +79682,230 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB105_3 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s75, s5, 24 +; VI-NEXT: s_lshr_b32 s72, s5, 24 ; VI-NEXT: s_lshr_b32 s36, s5, 16 -; VI-NEXT: s_lshr_b32 s58, s5, 8 +; VI-NEXT: s_lshr_b32 s56, s5, 8 ; VI-NEXT: s_lshr_b32 s37, s4, 16 -; VI-NEXT: s_lshr_b32 s56, s4, 8 -; VI-NEXT: s_lshr_b32 s77, s29, 24 -; VI-NEXT: s_lshr_b32 s38, s29, 16 -; VI-NEXT: s_lshr_b32 s62, s29, 8 -; VI-NEXT: s_lshr_b32 s39, s28, 16 -; VI-NEXT: s_lshr_b32 s57, s28, 8 -; VI-NEXT: s_lshr_b32 s79, s27, 24 -; VI-NEXT: s_lshr_b32 s48, s27, 16 -; VI-NEXT: s_lshr_b32 s74, s27, 8 -; VI-NEXT: s_lshr_b32 s49, s26, 16 -; VI-NEXT: s_lshr_b32 s59, s26, 8 -; VI-NEXT: s_lshr_b32 s89, s25, 24 -; VI-NEXT: s_lshr_b32 s50, s25, 16 -; VI-NEXT: s_lshr_b32 s76, s25, 8 -; VI-NEXT: s_lshr_b32 s51, s24, 16 -; VI-NEXT: s_lshr_b32 s60, s24, 8 -; VI-NEXT: s_lshr_b32 s91, s23, 24 -; VI-NEXT: s_lshr_b32 s52, s23, 16 -; VI-NEXT: s_lshr_b32 s78, s23, 8 -; VI-NEXT: s_lshr_b32 s53, s22, 16 -; VI-NEXT: s_lshr_b32 s61, s22, 8 -; VI-NEXT: s_lshr_b32 s31, s21, 24 -; VI-NEXT: s_lshr_b32 s54, s21, 16 -; VI-NEXT: s_lshr_b32 s88, s21, 8 -; VI-NEXT: s_lshr_b32 s55, s20, 16 -; VI-NEXT: s_lshr_b32 s63, s20, 8 -; VI-NEXT: s_lshr_b32 s34, s19, 24 -; VI-NEXT: s_lshr_b32 s64, s19, 16 -; VI-NEXT: s_lshr_b32 s90, s19, 8 -; VI-NEXT: s_lshr_b32 s65, s18, 16 -; VI-NEXT: s_lshr_b32 s72, s18, 8 -; VI-NEXT: s_lshr_b32 s35, s17, 24 -; VI-NEXT: s_lshr_b32 s66, s17, 16 -; VI-NEXT: s_lshr_b32 s30, s17, 8 -; VI-NEXT: s_lshr_b32 s67, s16, 16 -; VI-NEXT: s_lshr_b32 s73, s16, 8 +; VI-NEXT: s_lshr_b32 s57, s4, 8 +; VI-NEXT: s_lshr_b32 s75, s7, 24 +; VI-NEXT: s_lshr_b32 s38, s7, 16 +; VI-NEXT: s_lshr_b32 s58, s7, 8 +; VI-NEXT: s_lshr_b32 s39, s6, 16 +; VI-NEXT: s_lshr_b32 s59, s6, 8 +; VI-NEXT: s_lshr_b32 s77, s9, 24 +; VI-NEXT: s_lshr_b32 s48, s9, 16 +; VI-NEXT: s_lshr_b32 s60, s9, 8 +; VI-NEXT: s_lshr_b32 s49, s8, 16 +; VI-NEXT: s_lshr_b32 s61, s8, 8 +; VI-NEXT: s_lshr_b32 s79, s11, 24 +; VI-NEXT: s_lshr_b32 s50, s11, 16 +; VI-NEXT: s_lshr_b32 s62, s11, 8 +; VI-NEXT: s_lshr_b32 s51, s10, 16 +; VI-NEXT: s_lshr_b32 s63, s10, 8 +; VI-NEXT: s_lshr_b32 s90, s13, 24 +; VI-NEXT: s_lshr_b32 s52, s13, 16 +; VI-NEXT: s_lshr_b32 s73, s13, 8 +; VI-NEXT: s_lshr_b32 s53, s12, 16 +; VI-NEXT: s_lshr_b32 s74, s12, 8 +; VI-NEXT: s_lshr_b32 s31, s15, 24 +; VI-NEXT: s_lshr_b32 s54, s15, 16 +; VI-NEXT: s_lshr_b32 s76, s15, 8 +; VI-NEXT: s_lshr_b32 s55, s14, 16 +; VI-NEXT: s_lshr_b32 s78, s14, 8 +; VI-NEXT: s_lshr_b32 s34, s17, 24 +; VI-NEXT: s_lshr_b32 s64, s17, 16 +; VI-NEXT: s_lshr_b32 s88, s17, 8 +; VI-NEXT: s_lshr_b32 s65, s16, 16 +; VI-NEXT: s_lshr_b32 s89, s16, 8 +; VI-NEXT: s_lshr_b32 s35, s19, 24 +; VI-NEXT: s_lshr_b32 s66, s19, 16 +; VI-NEXT: s_lshr_b32 s91, s19, 8 +; VI-NEXT: s_lshr_b32 s67, s18, 16 +; VI-NEXT: s_lshr_b32 s30, s18, 8 ; VI-NEXT: s_lshr_b64 s[44:45], s[4:5], 24 -; VI-NEXT: s_lshr_b64 s[42:43], s[28:29], 24 -; VI-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 -; VI-NEXT: s_lshr_b64 s[14:15], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[12:13], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[42:43], s[6:7], 24 +; VI-NEXT: s_lshr_b64 s[40:41], s[8:9], 24 +; VI-NEXT: s_lshr_b64 s[28:29], s[10:11], 24 +; VI-NEXT: s_lshr_b64 s[26:27], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[24:25], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[22:23], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[20:21], s[18:19], 24 ; VI-NEXT: s_cbranch_execnz .LBB105_4 ; VI-NEXT: .LBB105_2: ; %cmp.true -; VI-NEXT: s_lshr_b32 s6, s17, 16 +; VI-NEXT: s_lshr_b32 s20, s19, 16 ; VI-NEXT: v_mov_b32_e32 v1, 0x200 -; VI-NEXT: v_add_f16_e32 v12, s6, v1 -; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: v_add_f16_e32 v12, s20, v1 +; VI-NEXT: v_add_f16_e32 v27, s19, v1 +; VI-NEXT: s_lshr_b32 s19, s18, 16 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 -; VI-NEXT: v_add_f16_e32 v27, s17, v1 -; VI-NEXT: v_add_f16_e32 v19, s6, v1 -; VI-NEXT: s_lshr_b32 s6, s19, 16 +; VI-NEXT: v_add_f16_e32 v19, s19, v1 +; VI-NEXT: v_add_f16_e32 v35, s18, v1 +; VI-NEXT: s_lshr_b32 s18, s17, 16 ; VI-NEXT: v_or_b32_e32 v10, v27, v2 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 -; VI-NEXT: v_add_f16_e32 v35, s16, v1 -; VI-NEXT: v_add_f16_e32 v13, s6, v1 -; VI-NEXT: s_lshr_b32 s6, s18, 16 +; VI-NEXT: v_add_f16_e32 v13, s18, v1 +; VI-NEXT: v_add_f16_e32 v28, s17, v1 +; VI-NEXT: s_lshr_b32 s17, s16, 16 ; VI-NEXT: v_or_b32_e32 v9, v35, v2 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 -; VI-NEXT: v_add_f16_e32 v28, s19, v1 -; VI-NEXT: v_add_f16_e32 v20, s6, v1 -; VI-NEXT: s_lshr_b32 s6, s21, 16 -; VI-NEXT: v_or_b32_e32 v62, v28, v2 +; VI-NEXT: v_add_f16_e32 v20, s17, v1 +; VI-NEXT: v_add_f16_e32 v36, s16, v1 +; VI-NEXT: s_lshr_b32 s16, s15, 16 +; VI-NEXT: v_or_b32_e32 v52, v28, v2 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 -; VI-NEXT: v_add_f16_e32 v36, s18, v1 -; VI-NEXT: v_add_f16_e32 v14, s6, v1 -; VI-NEXT: s_lshr_b32 s6, s20, 16 -; VI-NEXT: v_or_b32_e32 v61, v36, v2 +; VI-NEXT: v_add_f16_e32 v14, s16, v1 +; VI-NEXT: v_add_f16_e32 v29, s15, v1 +; VI-NEXT: s_lshr_b32 s15, s14, 16 +; VI-NEXT: v_or_b32_e32 v51, v36, v2 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 -; VI-NEXT: v_add_f16_e32 v29, s21, v1 -; VI-NEXT: v_add_f16_e32 v21, s6, v1 -; VI-NEXT: s_lshr_b32 s6, s23, 16 +; VI-NEXT: v_add_f16_e32 v21, s15, v1 +; VI-NEXT: v_add_f16_e32 v37, s14, v1 +; VI-NEXT: s_lshr_b32 s14, s13, 16 ; VI-NEXT: v_or_b32_e32 v8, v29, v2 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 -; VI-NEXT: v_add_f16_e32 v37, s20, v1 -; VI-NEXT: v_add_f16_e32 v15, s6, v1 -; VI-NEXT: s_lshr_b32 s6, s22, 16 +; VI-NEXT: v_add_f16_e32 v15, s14, v1 +; VI-NEXT: v_add_f16_e32 v30, s13, v1 +; VI-NEXT: s_lshr_b32 s13, s12, 16 ; VI-NEXT: v_or_b32_e32 v7, v37, v2 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v15 -; VI-NEXT: v_add_f16_e32 v30, s23, v1 -; VI-NEXT: v_add_f16_e32 v22, s6, v1 -; VI-NEXT: s_lshr_b32 s6, s25, 16 -; VI-NEXT: v_or_b32_e32 v47, v30, v2 +; VI-NEXT: v_add_f16_e32 v22, s13, v1 +; VI-NEXT: v_add_f16_e32 v38, s12, v1 +; VI-NEXT: s_lshr_b32 s12, s11, 16 +; VI-NEXT: v_or_b32_e32 v57, v30, v2 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; VI-NEXT: v_add_f16_e32 v38, s22, v1 -; VI-NEXT: v_add_f16_e32 v16, s6, v1 -; VI-NEXT: s_lshr_b32 s6, s24, 16 -; VI-NEXT: v_or_b32_e32 v46, v38, v2 +; VI-NEXT: v_add_f16_e32 v16, s12, v1 +; VI-NEXT: v_add_f16_e32 v31, s11, v1 +; VI-NEXT: s_lshr_b32 s11, s10, 16 +; VI-NEXT: v_or_b32_e32 v56, v38, v2 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 -; VI-NEXT: v_add_f16_e32 v31, s25, v1 -; VI-NEXT: v_add_f16_e32 v23, s6, v1 -; VI-NEXT: s_lshr_b32 s6, s27, 16 +; VI-NEXT: v_add_f16_e32 v23, s11, v1 +; VI-NEXT: v_add_f16_e32 v39, s10, v1 +; VI-NEXT: s_lshr_b32 s10, s9, 16 ; VI-NEXT: v_or_b32_e32 v6, v31, v2 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 -; VI-NEXT: v_add_f16_e32 v39, s24, v1 -; VI-NEXT: v_add_f16_e32 v17, s6, v1 -; VI-NEXT: s_lshr_b32 s6, s26, 16 +; VI-NEXT: v_add_f16_e32 v17, s10, v1 +; VI-NEXT: v_add_f16_e32 v32, s9, v1 +; VI-NEXT: s_lshr_b32 s9, s8, 16 ; VI-NEXT: v_or_b32_e32 v5, v39, v2 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; VI-NEXT: v_add_f16_e32 v32, s27, v1 -; VI-NEXT: v_add_f16_e32 v24, s6, v1 -; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: v_add_f16_e32 v24, s9, v1 +; VI-NEXT: v_add_f16_e32 v48, s8, v1 +; VI-NEXT: s_lshr_b32 s8, s7, 16 ; VI-NEXT: v_or_b32_e32 v43, v32, v2 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; VI-NEXT: v_add_f16_e32 v48, s26, v1 -; VI-NEXT: v_add_f16_e32 v18, s6, v1 -; VI-NEXT: s_lshr_b32 s6, s28, 16 +; VI-NEXT: v_add_f16_e32 v18, s8, v1 +; VI-NEXT: v_add_f16_e32 v33, s7, v1 +; VI-NEXT: s_lshr_b32 s7, s6, 16 ; VI-NEXT: v_or_b32_e32 v42, v48, v2 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v18 -; VI-NEXT: v_add_f16_e32 v33, s29, v1 -; VI-NEXT: v_add_f16_e32 v25, s6, v1 +; VI-NEXT: v_add_f16_e32 v25, s7, v1 +; VI-NEXT: v_add_f16_e32 v49, s6, v1 ; VI-NEXT: s_lshr_b32 s6, s5, 16 ; VI-NEXT: v_or_b32_e32 v55, v33, v2 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; VI-NEXT: v_add_f16_e32 v49, s28, v1 ; VI-NEXT: v_add_f16_e32 v11, s6, v1 ; VI-NEXT: v_add_f16_e32 v34, s5, v1 ; VI-NEXT: s_lshr_b32 s5, s4, 16 ; VI-NEXT: v_or_b32_e32 v54, v49, v2 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v11 ; VI-NEXT: v_add_f16_e32 v26, s5, v1 -; VI-NEXT: v_or_b32_e32 v52, v34, v2 +; VI-NEXT: v_or_b32_e32 v41, v34, v2 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 ; VI-NEXT: v_add_f16_e32 v50, s4, v1 -; VI-NEXT: v_or_b32_e32 v51, v50, v2 -; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[51:52] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v40, v50, v2 +; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[40:41] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[2:3], 24, v[54:55] ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[42:43] -; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[54:55] ; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v5 ; VI-NEXT: v_lshrrev_b64 v[4:5], 24, v[5:6] +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v41 ; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v42 ; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v6 -; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[46:47] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v7 +; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[56:57] +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v7 ; VI-NEXT: v_lshrrev_b64 v[6:7], 24, v[7:8] -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v52 -; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v47 -; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v8 -; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[61:62] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v56 +; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v8 +; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[51:52] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v40 +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v51 +; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v9 ; VI-NEXT: v_lshrrev_b64 v[8:9], 24, v[9:10] -; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v51 -; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v55 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v55 ; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v54 ; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v43 -; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v46 -; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v62 -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v61 -; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v57 +; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v52 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v10 ; VI-NEXT: v_bfe_u32 v9, v11, 8, 8 ; VI-NEXT: v_bfe_u32 v10, v18, 8, 8 ; VI-NEXT: v_bfe_u32 v40, v17, 8, 8 ; VI-NEXT: v_bfe_u32 v43, v16, 8, 8 ; VI-NEXT: v_bfe_u32 v46, v15, 8, 8 ; VI-NEXT: v_bfe_u32 v57, v14, 8, 8 -; VI-NEXT: v_bfe_u32 v59, v13, 8, 8 +; VI-NEXT: v_bfe_u32 v60, v13, 8, 8 ; VI-NEXT: v_bfe_u32 v62, v12, 8, 8 ; VI-NEXT: s_branch .LBB105_5 ; VI-NEXT: .LBB105_3: -; VI-NEXT: ; implicit-def: $sgpr73 -; VI-NEXT: ; implicit-def: $sgpr67 -; VI-NEXT: ; implicit-def: $sgpr6 ; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr67 +; VI-NEXT: ; implicit-def: $sgpr20 +; VI-NEXT: ; implicit-def: $sgpr91 ; VI-NEXT: ; implicit-def: $sgpr66 ; VI-NEXT: ; implicit-def: $sgpr35 -; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr89 ; VI-NEXT: ; implicit-def: $sgpr65 -; VI-NEXT: ; implicit-def: $sgpr8 -; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr22 +; VI-NEXT: ; implicit-def: $sgpr88 ; VI-NEXT: ; implicit-def: $sgpr64 ; VI-NEXT: ; implicit-def: $sgpr34 -; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr78 ; VI-NEXT: ; implicit-def: $sgpr55 -; VI-NEXT: ; implicit-def: $sgpr10 -; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr24 +; VI-NEXT: ; implicit-def: $sgpr76 ; VI-NEXT: ; implicit-def: $sgpr54 ; VI-NEXT: ; implicit-def: $sgpr31 -; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr74 ; VI-NEXT: ; implicit-def: $sgpr53 -; VI-NEXT: ; implicit-def: $sgpr12 -; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr73 ; VI-NEXT: ; implicit-def: $sgpr52 -; VI-NEXT: ; implicit-def: $sgpr91 -; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr63 ; VI-NEXT: ; implicit-def: $sgpr51 -; VI-NEXT: ; implicit-def: $sgpr14 -; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr62 ; VI-NEXT: ; implicit-def: $sgpr50 -; VI-NEXT: ; implicit-def: $sgpr89 -; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr61 ; VI-NEXT: ; implicit-def: $sgpr49 ; VI-NEXT: ; implicit-def: $sgpr40 -; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr60 ; VI-NEXT: ; implicit-def: $sgpr48 -; VI-NEXT: ; implicit-def: $sgpr79 -; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr59 ; VI-NEXT: ; implicit-def: $sgpr39 ; VI-NEXT: ; implicit-def: $sgpr42 -; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr58 ; VI-NEXT: ; implicit-def: $sgpr38 -; VI-NEXT: ; implicit-def: $sgpr77 -; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr57 ; VI-NEXT: ; implicit-def: $sgpr37 ; VI-NEXT: ; implicit-def: $sgpr44 -; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr56 ; VI-NEXT: ; implicit-def: $sgpr36 -; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr72 ; VI-NEXT: s_branch .LBB105_2 ; VI-NEXT: .LBB105_4: -; VI-NEXT: v_mov_b32_e32 v53, s56 -; VI-NEXT: v_mov_b32_e32 v52, s42 -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v2, s57 +; VI-NEXT: v_mov_b32_e32 v53, s58 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v2, s56 ; VI-NEXT: v_mov_b32_e32 v52, s44 -; VI-NEXT: v_mov_b32_e32 v1, s58 ; VI-NEXT: v_mov_b32_e32 v19, s67 ; VI-NEXT: v_mov_b32_e32 v12, s66 ; VI-NEXT: v_mov_b32_e32 v20, s65 @@ -79372,96 +79922,96 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; VI-NEXT: v_mov_b32_e32 v18, s38 ; VI-NEXT: v_mov_b32_e32 v26, s37 ; VI-NEXT: v_mov_b32_e32 v11, s36 -; VI-NEXT: v_mov_b32_e32 v35, s16 -; VI-NEXT: v_mov_b32_e32 v27, s17 -; VI-NEXT: v_mov_b32_e32 v36, s18 -; VI-NEXT: v_mov_b32_e32 v28, s19 -; VI-NEXT: v_mov_b32_e32 v37, s20 -; VI-NEXT: v_mov_b32_e32 v29, s21 -; VI-NEXT: v_mov_b32_e32 v38, s22 -; VI-NEXT: v_mov_b32_e32 v30, s23 -; VI-NEXT: v_mov_b32_e32 v39, s24 -; VI-NEXT: v_mov_b32_e32 v31, s25 -; VI-NEXT: v_mov_b32_e32 v48, s26 -; VI-NEXT: v_mov_b32_e32 v32, s27 -; VI-NEXT: v_mov_b32_e32 v49, s28 -; VI-NEXT: v_mov_b32_e32 v33, s29 +; VI-NEXT: v_mov_b32_e32 v35, s18 +; VI-NEXT: v_mov_b32_e32 v27, s19 +; VI-NEXT: v_mov_b32_e32 v36, s16 +; VI-NEXT: v_mov_b32_e32 v28, s17 +; VI-NEXT: v_mov_b32_e32 v37, s14 +; VI-NEXT: v_mov_b32_e32 v29, s15 +; VI-NEXT: v_mov_b32_e32 v38, s12 +; VI-NEXT: v_mov_b32_e32 v30, s13 +; VI-NEXT: v_mov_b32_e32 v39, s10 +; VI-NEXT: v_mov_b32_e32 v31, s11 +; VI-NEXT: v_mov_b32_e32 v48, s8 +; VI-NEXT: v_mov_b32_e32 v32, s9 +; VI-NEXT: v_mov_b32_e32 v49, s6 +; VI-NEXT: v_mov_b32_e32 v33, s7 ; VI-NEXT: v_mov_b32_e32 v50, s4 ; VI-NEXT: v_mov_b32_e32 v34, s5 ; VI-NEXT: v_mov_b32_e32 v62, s35 -; VI-NEXT: v_mov_b32_e32 v59, s34 +; VI-NEXT: v_mov_b32_e32 v60, s34 ; VI-NEXT: v_mov_b32_e32 v57, s31 -; VI-NEXT: v_mov_b32_e32 v46, s91 -; VI-NEXT: v_mov_b32_e32 v43, s89 -; VI-NEXT: v_mov_b32_e32 v40, s79 -; VI-NEXT: v_mov_b32_e32 v10, s77 -; VI-NEXT: v_mov_b32_e32 v61, s30 -; VI-NEXT: v_mov_b32_e32 v58, s90 -; VI-NEXT: v_mov_b32_e32 v47, s88 -; VI-NEXT: v_mov_b32_e32 v45, s78 -; VI-NEXT: v_mov_b32_e32 v42, s76 -; VI-NEXT: v_mov_b32_e32 v55, s74 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v54, s57 -; VI-NEXT: v_mov_b32_e32 v41, s59 -; VI-NEXT: v_mov_b32_e32 v44, s60 -; VI-NEXT: v_mov_b32_e32 v56, s61 -; VI-NEXT: v_mov_b32_e32 v60, s63 -; VI-NEXT: v_mov_b32_e32 v51, s72 -; VI-NEXT: v_mov_b32_e32 v1, s73 -; VI-NEXT: v_mov_b32_e32 v8, s6 -; VI-NEXT: v_mov_b32_e32 v7, s8 -; VI-NEXT: v_mov_b32_e32 v6, s10 -; VI-NEXT: v_mov_b32_e32 v5, s12 -; VI-NEXT: v_mov_b32_e32 v4, s14 +; VI-NEXT: v_mov_b32_e32 v46, s90 +; VI-NEXT: v_mov_b32_e32 v43, s79 +; VI-NEXT: v_mov_b32_e32 v40, s77 +; VI-NEXT: v_mov_b32_e32 v10, s75 +; VI-NEXT: v_mov_b32_e32 v9, s72 +; VI-NEXT: v_mov_b32_e32 v51, s30 +; VI-NEXT: v_mov_b32_e32 v1, s91 +; VI-NEXT: v_mov_b32_e32 v61, s89 +; VI-NEXT: v_mov_b32_e32 v59, s88 +; VI-NEXT: v_mov_b32_e32 v58, s78 +; VI-NEXT: v_mov_b32_e32 v56, s76 +; VI-NEXT: v_mov_b32_e32 v47, s74 +; VI-NEXT: v_mov_b32_e32 v45, s73 +; VI-NEXT: v_mov_b32_e32 v44, s63 +; VI-NEXT: v_mov_b32_e32 v42, s62 +; VI-NEXT: v_mov_b32_e32 v41, s61 +; VI-NEXT: v_mov_b32_e32 v55, s60 +; VI-NEXT: v_mov_b32_e32 v54, s59 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v8, s20 +; VI-NEXT: v_mov_b32_e32 v7, s22 +; VI-NEXT: v_mov_b32_e32 v6, s24 +; VI-NEXT: v_mov_b32_e32 v5, s26 +; VI-NEXT: v_mov_b32_e32 v4, s28 ; VI-NEXT: v_mov_b32_e32 v3, s40 -; VI-NEXT: v_mov_b32_e32 v9, s75 -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v52, s62 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; VI-NEXT: .LBB105_5: ; %end -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v51, 8, v51 ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v35, v35, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v19, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v61 +; VI-NEXT: v_or_b32_sdwa v8, v35, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v62 ; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v12, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v8, vcc, 4, v0 ; VI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v51 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v61 ; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v20, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v7, vcc, 8, v0 ; VI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v58 -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v59 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v59 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v60 ; VI-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v13, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v7, vcc, 12, v0 ; VI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v60 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v58 ; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v21, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v6, vcc, 16, v0 ; VI-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v47 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v56 ; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v57 ; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v14, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v6, vcc, 20, v0 ; VI-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v56 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v47 ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v22, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -79503,10 +80053,23 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v3, vcc, 44, v0 ; VI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v54 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v53 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v10 +; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; VI-NEXT: v_readlane_b32 s67, v63, 19 ; VI-NEXT: v_readlane_b32 s66, v63, 18 ; VI-NEXT: v_readlane_b32 s65, v63, 17 @@ -79527,30 +80090,16 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; VI-NEXT: v_readlane_b32 s34, v63, 2 ; VI-NEXT: v_readlane_b32 s31, v63, 1 ; VI-NEXT: v_readlane_b32 s30, v63, 0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v52 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v10 -; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v53 ; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v26, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v9 ; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 @@ -79575,7 +80124,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -79601,10 +80150,38 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: v_writelane_b32 v63, s52, 12 ; GFX9-NEXT: v_writelane_b32 v63, s53, 13 ; GFX9-NEXT: v_writelane_b32 v63, s54, 14 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: v_mov_b32_e32 v5, s17 +; GFX9-NEXT: v_mov_b32_e32 v6, s18 +; GFX9-NEXT: v_mov_b32_e32 v7, s19 +; GFX9-NEXT: v_mov_b32_e32 v8, s20 +; GFX9-NEXT: v_mov_b32_e32 v9, s21 +; GFX9-NEXT: v_mov_b32_e32 v10, s22 +; GFX9-NEXT: v_mov_b32_e32 v11, s23 +; GFX9-NEXT: v_mov_b32_e32 v12, s24 +; GFX9-NEXT: v_mov_b32_e32 v13, s25 +; GFX9-NEXT: v_mov_b32_e32 v14, s26 +; GFX9-NEXT: v_mov_b32_e32 v15, s27 +; GFX9-NEXT: v_mov_b32_e32 v16, s28 +; GFX9-NEXT: v_mov_b32_e32 v17, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; GFX9-NEXT: v_writelane_b32 v63, s55, 15 +; GFX9-NEXT: v_readfirstlane_b32 s18, v4 +; GFX9-NEXT: v_readfirstlane_b32 s19, v5 +; GFX9-NEXT: v_readfirstlane_b32 s16, v6 +; GFX9-NEXT: v_readfirstlane_b32 s17, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s12, v10 +; GFX9-NEXT: v_readfirstlane_b32 s13, v11 +; GFX9-NEXT: v_readfirstlane_b32 s10, v12 +; GFX9-NEXT: v_readfirstlane_b32 s11, v13 +; GFX9-NEXT: v_readfirstlane_b32 s8, v14 +; GFX9-NEXT: v_readfirstlane_b32 s9, v15 +; GFX9-NEXT: v_readfirstlane_b32 s6, v16 +; GFX9-NEXT: v_readfirstlane_b32 s7, v17 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec ; GFX9-NEXT: v_readfirstlane_b32 s5, v2 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -79628,66 +80205,66 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: s_lshr_b32 s59, s5, 8 ; GFX9-NEXT: s_lshr_b32 s58, s4, 16 ; GFX9-NEXT: s_lshr_b32 s60, s4, 8 -; GFX9-NEXT: s_lshr_b32 s61, s29, 24 -; GFX9-NEXT: s_lshr_b32 s62, s29, 16 -; GFX9-NEXT: s_lshr_b32 s72, s29, 8 -; GFX9-NEXT: s_lshr_b32 s63, s28, 16 -; GFX9-NEXT: s_lshr_b32 s73, s28, 8 -; GFX9-NEXT: s_lshr_b32 s74, s27, 24 -; GFX9-NEXT: s_lshr_b32 s75, s27, 16 -; GFX9-NEXT: s_lshr_b32 s77, s27, 8 -; GFX9-NEXT: s_lshr_b32 s76, s26, 16 -; GFX9-NEXT: s_lshr_b32 s78, s26, 8 -; GFX9-NEXT: s_lshr_b32 s79, s25, 24 -; GFX9-NEXT: s_lshr_b32 s88, s25, 16 -; GFX9-NEXT: s_lshr_b32 s90, s25, 8 -; GFX9-NEXT: s_lshr_b32 s89, s24, 16 -; GFX9-NEXT: s_lshr_b32 s91, s24, 8 -; GFX9-NEXT: s_lshr_b32 s92, s23, 24 -; GFX9-NEXT: s_lshr_b32 s93, s23, 16 -; GFX9-NEXT: s_lshr_b32 s95, s23, 8 -; GFX9-NEXT: s_lshr_b32 s94, s22, 16 -; GFX9-NEXT: s_lshr_b32 s30, s22, 8 -; GFX9-NEXT: s_lshr_b32 s31, s21, 24 -; GFX9-NEXT: s_lshr_b32 s34, s21, 16 -; GFX9-NEXT: s_lshr_b32 s36, s21, 8 -; GFX9-NEXT: s_lshr_b32 s35, s20, 16 -; GFX9-NEXT: s_lshr_b32 s37, s20, 8 -; GFX9-NEXT: s_lshr_b32 s38, s19, 24 -; GFX9-NEXT: s_lshr_b32 s39, s19, 16 -; GFX9-NEXT: s_lshr_b32 s49, s19, 8 -; GFX9-NEXT: s_lshr_b32 s48, s18, 16 -; GFX9-NEXT: s_lshr_b32 s50, s18, 8 -; GFX9-NEXT: s_lshr_b32 s51, s17, 24 -; GFX9-NEXT: s_lshr_b32 s52, s17, 16 -; GFX9-NEXT: s_lshr_b32 s54, s17, 8 -; GFX9-NEXT: s_lshr_b32 s53, s16, 16 -; GFX9-NEXT: s_lshr_b32 s55, s16, 8 +; GFX9-NEXT: s_lshr_b32 s61, s7, 24 +; GFX9-NEXT: s_lshr_b32 s62, s7, 16 +; GFX9-NEXT: s_lshr_b32 s72, s7, 8 +; GFX9-NEXT: s_lshr_b32 s63, s6, 16 +; GFX9-NEXT: s_lshr_b32 s73, s6, 8 +; GFX9-NEXT: s_lshr_b32 s74, s9, 24 +; GFX9-NEXT: s_lshr_b32 s75, s9, 16 +; GFX9-NEXT: s_lshr_b32 s77, s9, 8 +; GFX9-NEXT: s_lshr_b32 s76, s8, 16 +; GFX9-NEXT: s_lshr_b32 s78, s8, 8 +; GFX9-NEXT: s_lshr_b32 s79, s11, 24 +; GFX9-NEXT: s_lshr_b32 s88, s11, 16 +; GFX9-NEXT: s_lshr_b32 s90, s11, 8 +; GFX9-NEXT: s_lshr_b32 s89, s10, 16 +; GFX9-NEXT: s_lshr_b32 s91, s10, 8 +; GFX9-NEXT: s_lshr_b32 s92, s13, 24 +; GFX9-NEXT: s_lshr_b32 s93, s13, 16 +; GFX9-NEXT: s_lshr_b32 s95, s13, 8 +; GFX9-NEXT: s_lshr_b32 s94, s12, 16 +; GFX9-NEXT: s_lshr_b32 s30, s12, 8 +; GFX9-NEXT: s_lshr_b32 s31, s15, 24 +; GFX9-NEXT: s_lshr_b32 s34, s15, 16 +; GFX9-NEXT: s_lshr_b32 s36, s15, 8 +; GFX9-NEXT: s_lshr_b32 s35, s14, 16 +; GFX9-NEXT: s_lshr_b32 s37, s14, 8 +; GFX9-NEXT: s_lshr_b32 s38, s17, 24 +; GFX9-NEXT: s_lshr_b32 s39, s17, 16 +; GFX9-NEXT: s_lshr_b32 s49, s17, 8 +; GFX9-NEXT: s_lshr_b32 s48, s16, 16 +; GFX9-NEXT: s_lshr_b32 s50, s16, 8 +; GFX9-NEXT: s_lshr_b32 s51, s19, 24 +; GFX9-NEXT: s_lshr_b32 s52, s19, 16 +; GFX9-NEXT: s_lshr_b32 s54, s19, 8 +; GFX9-NEXT: s_lshr_b32 s53, s18, 16 +; GFX9-NEXT: s_lshr_b32 s55, s18, 8 ; GFX9-NEXT: s_lshr_b64 s[44:45], s[4:5], 24 -; GFX9-NEXT: s_lshr_b64 s[42:43], s[28:29], 24 -; GFX9-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 -; GFX9-NEXT: s_lshr_b64 s[14:15], s[24:25], 24 -; GFX9-NEXT: s_lshr_b64 s[12:13], s[22:23], 24 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 -; GFX9-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 -; GFX9-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[42:43], s[6:7], 24 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[8:9], 24 +; GFX9-NEXT: s_lshr_b64 s[28:29], s[10:11], 24 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[12:13], 24 +; GFX9-NEXT: s_lshr_b64 s[24:25], s[14:15], 24 +; GFX9-NEXT: s_lshr_b64 s[22:23], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[20:21], s[18:19], 24 ; GFX9-NEXT: s_cbranch_execnz .LBB105_4 ; GFX9-NEXT: .LBB105_2: ; %cmp.true ; GFX9-NEXT: v_mov_b32_e32 v1, 0x200 -; GFX9-NEXT: v_pk_add_f16 v20, s17, v1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v19, s16, v1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v16, s19, v1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v15, s18, v1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v12, s21, v1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v11, s20, v1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v10, s23, v1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v9, s22, v1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v8, s25, v1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v7, s24, v1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v6, s27, v1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v5, s26, v1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v4, s29, v1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v3, s28, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, s19, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, s18, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, s17, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, s16, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s15, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s14, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s13, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s12, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s11, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s10, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s9, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s8, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s7, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s6, v1 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v2, s5, v1 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, s4, v1 op_sel_hi:[1,0] ; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[1:2] @@ -79748,31 +80325,31 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: .LBB105_3: ; GFX9-NEXT: ; implicit-def: $sgpr55 ; GFX9-NEXT: ; implicit-def: $sgpr53 -; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr20 ; GFX9-NEXT: ; implicit-def: $sgpr54 ; GFX9-NEXT: ; implicit-def: $sgpr52 ; GFX9-NEXT: ; implicit-def: $sgpr51 ; GFX9-NEXT: ; implicit-def: $sgpr50 ; GFX9-NEXT: ; implicit-def: $sgpr48 -; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr22 ; GFX9-NEXT: ; implicit-def: $sgpr49 ; GFX9-NEXT: ; implicit-def: $sgpr39 ; GFX9-NEXT: ; implicit-def: $sgpr38 ; GFX9-NEXT: ; implicit-def: $sgpr37 ; GFX9-NEXT: ; implicit-def: $sgpr35 -; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr24 ; GFX9-NEXT: ; implicit-def: $sgpr36 ; GFX9-NEXT: ; implicit-def: $sgpr34 ; GFX9-NEXT: ; implicit-def: $sgpr31 ; GFX9-NEXT: ; implicit-def: $sgpr30 ; GFX9-NEXT: ; implicit-def: $sgpr94 -; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; implicit-def: $sgpr95 ; GFX9-NEXT: ; implicit-def: $sgpr93 ; GFX9-NEXT: ; implicit-def: $sgpr92 ; GFX9-NEXT: ; implicit-def: $sgpr91 ; GFX9-NEXT: ; implicit-def: $sgpr89 -; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr28 ; GFX9-NEXT: ; implicit-def: $sgpr90 ; GFX9-NEXT: ; implicit-def: $sgpr88 ; GFX9-NEXT: ; implicit-def: $sgpr79 @@ -79801,20 +80378,20 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v21, s42 -; GFX9-NEXT: v_mov_b32_e32 v19, s16 -; GFX9-NEXT: v_mov_b32_e32 v20, s17 -; GFX9-NEXT: v_mov_b32_e32 v15, s18 -; GFX9-NEXT: v_mov_b32_e32 v16, s19 -; GFX9-NEXT: v_mov_b32_e32 v11, s20 -; GFX9-NEXT: v_mov_b32_e32 v12, s21 -; GFX9-NEXT: v_mov_b32_e32 v9, s22 -; GFX9-NEXT: v_mov_b32_e32 v10, s23 -; GFX9-NEXT: v_mov_b32_e32 v7, s24 -; GFX9-NEXT: v_mov_b32_e32 v8, s25 -; GFX9-NEXT: v_mov_b32_e32 v5, s26 -; GFX9-NEXT: v_mov_b32_e32 v6, s27 -; GFX9-NEXT: v_mov_b32_e32 v3, s28 -; GFX9-NEXT: v_mov_b32_e32 v4, s29 +; GFX9-NEXT: v_mov_b32_e32 v19, s18 +; GFX9-NEXT: v_mov_b32_e32 v20, s19 +; GFX9-NEXT: v_mov_b32_e32 v15, s16 +; GFX9-NEXT: v_mov_b32_e32 v16, s17 +; GFX9-NEXT: v_mov_b32_e32 v11, s14 +; GFX9-NEXT: v_mov_b32_e32 v12, s15 +; GFX9-NEXT: v_mov_b32_e32 v9, s12 +; GFX9-NEXT: v_mov_b32_e32 v10, s13 +; GFX9-NEXT: v_mov_b32_e32 v7, s10 +; GFX9-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-NEXT: v_mov_b32_e32 v6, s9 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mov_b32_e32 v17, s55 @@ -79857,15 +80434,15 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: v_mov_b32_e32 v27, s59 ; GFX9-NEXT: v_mov_b32_e32 v14, s57 ; GFX9-NEXT: v_mov_b32_e32 v18, s56 -; GFX9-NEXT: v_mov_b32_e32 v23, s12 -; GFX9-NEXT: v_mov_b32_e32 v24, s10 -; GFX9-NEXT: v_mov_b32_e32 v25, s8 -; GFX9-NEXT: v_mov_b32_e32 v26, s6 +; GFX9-NEXT: v_mov_b32_e32 v23, s26 +; GFX9-NEXT: v_mov_b32_e32 v24, s24 +; GFX9-NEXT: v_mov_b32_e32 v25, s22 +; GFX9-NEXT: v_mov_b32_e32 v26, s20 ; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v21, s40 -; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: v_mov_b32_e32 v22, s28 ; GFX9-NEXT: .LBB105_5: ; %end ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; GFX9-NEXT: v_or_b32_sdwa v17, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -88432,657 +89009,685 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v4, s30, 0 -; VI-NEXT: v_writelane_b32 v4, s31, 1 -; VI-NEXT: v_writelane_b32 v4, s34, 2 -; VI-NEXT: v_writelane_b32 v4, s35, 3 -; VI-NEXT: v_writelane_b32 v4, s36, 4 -; VI-NEXT: v_writelane_b32 v4, s37, 5 -; VI-NEXT: v_writelane_b32 v4, s38, 6 -; VI-NEXT: v_writelane_b32 v4, s39, 7 -; VI-NEXT: v_writelane_b32 v4, s48, 8 -; VI-NEXT: v_writelane_b32 v4, s49, 9 -; VI-NEXT: v_writelane_b32 v4, s50, 10 -; VI-NEXT: v_writelane_b32 v4, s51, 11 -; VI-NEXT: v_writelane_b32 v4, s52, 12 -; VI-NEXT: v_writelane_b32 v4, s53, 13 -; VI-NEXT: v_writelane_b32 v4, s54, 14 -; VI-NEXT: v_writelane_b32 v4, s55, 15 -; VI-NEXT: v_writelane_b32 v4, s64, 16 -; VI-NEXT: v_writelane_b32 v4, s65, 17 -; VI-NEXT: v_writelane_b32 v4, s66, 18 -; VI-NEXT: v_writelane_b32 v4, s67, 19 -; VI-NEXT: v_writelane_b32 v4, s68, 20 -; VI-NEXT: v_writelane_b32 v4, s69, 21 -; VI-NEXT: v_writelane_b32 v4, s70, 22 -; VI-NEXT: v_writelane_b32 v4, s71, 23 -; VI-NEXT: v_writelane_b32 v4, s80, 24 -; VI-NEXT: v_writelane_b32 v4, s81, 25 +; VI-NEXT: v_writelane_b32 v18, s30, 0 +; VI-NEXT: v_writelane_b32 v18, s31, 1 +; VI-NEXT: v_writelane_b32 v18, s34, 2 +; VI-NEXT: v_writelane_b32 v18, s35, 3 +; VI-NEXT: v_writelane_b32 v18, s36, 4 +; VI-NEXT: v_writelane_b32 v18, s37, 5 +; VI-NEXT: v_writelane_b32 v18, s38, 6 +; VI-NEXT: v_writelane_b32 v18, s39, 7 +; VI-NEXT: v_writelane_b32 v18, s48, 8 +; VI-NEXT: v_writelane_b32 v18, s49, 9 +; VI-NEXT: v_writelane_b32 v18, s50, 10 +; VI-NEXT: v_writelane_b32 v18, s51, 11 +; VI-NEXT: v_writelane_b32 v18, s52, 12 +; VI-NEXT: v_writelane_b32 v18, s53, 13 +; VI-NEXT: v_writelane_b32 v18, s54, 14 +; VI-NEXT: v_writelane_b32 v18, s55, 15 +; VI-NEXT: v_writelane_b32 v18, s64, 16 +; VI-NEXT: v_writelane_b32 v18, s65, 17 +; VI-NEXT: v_writelane_b32 v18, s66, 18 +; VI-NEXT: v_writelane_b32 v18, s67, 19 +; VI-NEXT: v_writelane_b32 v18, s68, 20 +; VI-NEXT: v_writelane_b32 v18, s69, 21 +; VI-NEXT: v_writelane_b32 v18, s70, 22 +; VI-NEXT: v_writelane_b32 v18, s71, 23 +; VI-NEXT: v_writelane_b32 v18, s80, 24 +; VI-NEXT: v_writelane_b32 v18, s81, 25 +; VI-NEXT: v_mov_b32_e32 v4, s16 +; VI-NEXT: v_mov_b32_e32 v5, s17 +; VI-NEXT: v_mov_b32_e32 v6, s18 +; VI-NEXT: v_mov_b32_e32 v7, s19 +; VI-NEXT: v_mov_b32_e32 v8, s20 +; VI-NEXT: v_mov_b32_e32 v9, s21 +; VI-NEXT: v_mov_b32_e32 v10, s22 +; VI-NEXT: v_mov_b32_e32 v11, s23 +; VI-NEXT: v_mov_b32_e32 v12, s24 +; VI-NEXT: v_mov_b32_e32 v13, s25 +; VI-NEXT: v_mov_b32_e32 v14, s26 +; VI-NEXT: v_mov_b32_e32 v15, s27 +; VI-NEXT: v_mov_b32_e32 v16, s28 +; VI-NEXT: v_mov_b32_e32 v17, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; VI-NEXT: v_writelane_b32 v4, s82, 26 +; VI-NEXT: v_writelane_b32 v18, s82, 26 +; VI-NEXT: v_readfirstlane_b32 s18, v4 +; VI-NEXT: v_readfirstlane_b32 s19, v5 +; VI-NEXT: v_readfirstlane_b32 s16, v6 +; VI-NEXT: v_readfirstlane_b32 s17, v7 +; VI-NEXT: v_readfirstlane_b32 s14, v8 +; VI-NEXT: v_readfirstlane_b32 s15, v9 +; VI-NEXT: v_readfirstlane_b32 s12, v10 +; VI-NEXT: v_readfirstlane_b32 s13, v11 +; VI-NEXT: v_readfirstlane_b32 s10, v12 +; VI-NEXT: v_readfirstlane_b32 s11, v13 +; VI-NEXT: v_readfirstlane_b32 s8, v14 +; VI-NEXT: v_readfirstlane_b32 s9, v15 +; VI-NEXT: v_readfirstlane_b32 s6, v16 +; VI-NEXT: v_readfirstlane_b32 s7, v17 ; VI-NEXT: v_readfirstlane_b32 s4, v1 -; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_and_b64 s[20:21], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s5, v2 -; VI-NEXT: v_writelane_b32 v4, s83, 27 +; VI-NEXT: v_writelane_b32 v18, s83, 27 ; VI-NEXT: s_cbranch_scc0 .LBB109_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s7, s5, 24 -; VI-NEXT: s_lshr_b32 s9, s5, 16 -; VI-NEXT: s_lshr_b32 s11, s5, 8 -; VI-NEXT: s_lshr_b32 s13, s4, 16 -; VI-NEXT: s_lshr_b32 s15, s4, 8 -; VI-NEXT: s_lshr_b32 s41, s29, 24 -; VI-NEXT: s_lshr_b32 s47, s29, 16 -; VI-NEXT: s_lshr_b32 s57, s29, 8 -; VI-NEXT: s_lshr_b32 s88, s28, 16 -; VI-NEXT: s_lshr_b32 s89, s28, 8 -; VI-NEXT: s_lshr_b32 s90, s27, 24 -; VI-NEXT: s_lshr_b32 s91, s27, 16 -; VI-NEXT: s_lshr_b32 s30, s27, 8 -; VI-NEXT: s_lshr_b32 s31, s26, 16 -; VI-NEXT: s_lshr_b32 s34, s26, 8 -; VI-NEXT: s_lshr_b32 s35, s25, 24 -; VI-NEXT: s_lshr_b32 s36, s25, 16 -; VI-NEXT: s_lshr_b32 s37, s25, 8 -; VI-NEXT: s_lshr_b32 s38, s24, 16 -; VI-NEXT: s_lshr_b32 s39, s24, 8 -; VI-NEXT: s_lshr_b32 s48, s23, 24 -; VI-NEXT: s_lshr_b32 s49, s23, 16 -; VI-NEXT: s_lshr_b32 s50, s23, 8 -; VI-NEXT: s_lshr_b32 s51, s22, 16 -; VI-NEXT: s_lshr_b32 s52, s22, 8 -; VI-NEXT: s_lshr_b32 s53, s21, 24 -; VI-NEXT: s_lshr_b32 s54, s21, 16 -; VI-NEXT: s_lshr_b32 s55, s21, 8 -; VI-NEXT: s_lshr_b32 s64, s20, 16 -; VI-NEXT: s_lshr_b32 s65, s20, 8 -; VI-NEXT: s_lshr_b32 s66, s19, 24 -; VI-NEXT: s_lshr_b32 s67, s19, 16 -; VI-NEXT: s_lshr_b32 s68, s19, 8 -; VI-NEXT: s_lshr_b32 s69, s18, 16 -; VI-NEXT: s_lshr_b32 s70, s18, 8 -; VI-NEXT: s_lshr_b32 s71, s17, 24 -; VI-NEXT: s_lshr_b32 s80, s17, 16 -; VI-NEXT: s_lshr_b32 s81, s17, 8 -; VI-NEXT: s_lshr_b32 s82, s16, 16 -; VI-NEXT: s_lshr_b32 s83, s16, 8 +; VI-NEXT: s_lshr_b32 s21, s5, 24 +; VI-NEXT: s_lshr_b32 s23, s5, 16 +; VI-NEXT: s_lshr_b32 s25, s5, 8 +; VI-NEXT: s_lshr_b32 s27, s4, 16 +; VI-NEXT: s_lshr_b32 s29, s4, 8 +; VI-NEXT: s_lshr_b32 s41, s7, 24 +; VI-NEXT: s_lshr_b32 s47, s7, 16 +; VI-NEXT: s_lshr_b32 s57, s7, 8 +; VI-NEXT: s_lshr_b32 s88, s6, 16 +; VI-NEXT: s_lshr_b32 s89, s6, 8 +; VI-NEXT: s_lshr_b32 s90, s9, 24 +; VI-NEXT: s_lshr_b32 s91, s9, 16 +; VI-NEXT: s_lshr_b32 s30, s9, 8 +; VI-NEXT: s_lshr_b32 s31, s8, 16 +; VI-NEXT: s_lshr_b32 s34, s8, 8 +; VI-NEXT: s_lshr_b32 s35, s11, 24 +; VI-NEXT: s_lshr_b32 s36, s11, 16 +; VI-NEXT: s_lshr_b32 s37, s11, 8 +; VI-NEXT: s_lshr_b32 s38, s10, 16 +; VI-NEXT: s_lshr_b32 s39, s10, 8 +; VI-NEXT: s_lshr_b32 s48, s13, 24 +; VI-NEXT: s_lshr_b32 s49, s13, 16 +; VI-NEXT: s_lshr_b32 s50, s13, 8 +; VI-NEXT: s_lshr_b32 s51, s12, 16 +; VI-NEXT: s_lshr_b32 s52, s12, 8 +; VI-NEXT: s_lshr_b32 s53, s15, 24 +; VI-NEXT: s_lshr_b32 s54, s15, 16 +; VI-NEXT: s_lshr_b32 s55, s15, 8 +; VI-NEXT: s_lshr_b32 s64, s14, 16 +; VI-NEXT: s_lshr_b32 s65, s14, 8 +; VI-NEXT: s_lshr_b32 s66, s17, 24 +; VI-NEXT: s_lshr_b32 s67, s17, 16 +; VI-NEXT: s_lshr_b32 s68, s17, 8 +; VI-NEXT: s_lshr_b32 s69, s16, 16 +; VI-NEXT: s_lshr_b32 s70, s16, 8 +; VI-NEXT: s_lshr_b32 s71, s19, 24 +; VI-NEXT: s_lshr_b32 s80, s19, 16 +; VI-NEXT: s_lshr_b32 s81, s19, 8 +; VI-NEXT: s_lshr_b32 s82, s18, 16 +; VI-NEXT: s_lshr_b32 s83, s18, 8 ; VI-NEXT: s_lshr_b64 s[42:43], s[4:5], 24 -; VI-NEXT: s_lshr_b64 s[44:45], s[28:29], 24 -; VI-NEXT: s_lshr_b64 s[58:59], s[26:27], 24 -; VI-NEXT: s_lshr_b64 s[60:61], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[72:73], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[74:75], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[76:77], s[16:17], 24 -; VI-NEXT: s_mov_b32 s6, s17 -; VI-NEXT: s_mov_b32 s8, s19 -; VI-NEXT: s_mov_b32 s10, s21 -; VI-NEXT: s_mov_b32 s12, s23 -; VI-NEXT: s_mov_b32 s14, s25 -; VI-NEXT: s_mov_b32 s40, s27 -; VI-NEXT: s_mov_b32 s46, s29 +; VI-NEXT: s_lshr_b64 s[44:45], s[6:7], 24 +; VI-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 +; VI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 +; VI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[74:75], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[76:77], s[18:19], 24 +; VI-NEXT: s_mov_b32 s20, s19 +; VI-NEXT: s_mov_b32 s22, s17 +; VI-NEXT: s_mov_b32 s24, s15 +; VI-NEXT: s_mov_b32 s26, s13 +; VI-NEXT: s_mov_b32 s28, s11 +; VI-NEXT: s_mov_b32 s40, s9 +; VI-NEXT: s_mov_b32 s46, s7 ; VI-NEXT: s_mov_b32 s56, s5 ; VI-NEXT: s_cbranch_execnz .LBB109_3 ; VI-NEXT: .LBB109_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s6, s17, 16 +; VI-NEXT: s_lshl_b32 s20, s19, 16 ; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v2, s6, v1 -; VI-NEXT: v_readfirstlane_b32 s6, v2 -; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 -; VI-NEXT: s_add_i32 s7, s7, s6 -; VI-NEXT: s_add_i32 s8, s7, 0x7fff -; VI-NEXT: s_or_b32 s9, s6, 0x400000 +; VI-NEXT: v_add_f32_e32 v2, s20, v1 +; VI-NEXT: v_readfirstlane_b32 s20, v2 +; VI-NEXT: s_bfe_u32 s21, s20, 0x10010 +; VI-NEXT: s_add_i32 s21, s21, s20 +; VI-NEXT: s_add_i32 s22, s21, 0x7fff +; VI-NEXT: s_or_b32 s23, s20, 0x400000 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b64 s[6:7], vcc, exec -; VI-NEXT: s_cselect_b32 s6, s9, s8 -; VI-NEXT: s_and_b32 s7, s17, 0xffff0000 -; VI-NEXT: v_add_f32_e32 v2, s7, v1 -; VI-NEXT: v_readfirstlane_b32 s7, v2 -; VI-NEXT: s_bfe_u32 s8, s7, 0x10010 -; VI-NEXT: s_add_i32 s8, s8, s7 -; VI-NEXT: s_add_i32 s10, s8, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: s_and_b64 s[20:21], vcc, exec +; VI-NEXT: s_cselect_b32 s20, s23, s22 +; VI-NEXT: s_and_b32 s19, s19, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s19, v1 +; VI-NEXT: v_readfirstlane_b32 s19, v2 +; VI-NEXT: s_bfe_u32 s21, s19, 0x10010 +; VI-NEXT: s_add_i32 s21, s21, s19 +; VI-NEXT: s_addk_i32 s21, 0x7fff +; VI-NEXT: s_bitset1_b32 s19, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b64 s[8:9], vcc, exec -; VI-NEXT: s_cselect_b32 s7, s7, s10 -; VI-NEXT: s_lshr_b32 s7, s7, 16 -; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], 16 -; VI-NEXT: s_lshl_b32 s7, s16, 16 -; VI-NEXT: v_add_f32_e32 v2, s7, v1 -; VI-NEXT: v_readfirstlane_b32 s7, v2 -; VI-NEXT: s_bfe_u32 s8, s7, 0x10010 -; VI-NEXT: s_add_i32 s8, s8, s7 -; VI-NEXT: s_add_i32 s10, s8, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: s_and_b64 s[22:23], vcc, exec +; VI-NEXT: s_cselect_b32 s19, s19, s21 +; VI-NEXT: s_lshr_b32 s21, s19, 16 +; VI-NEXT: s_lshl_b32 s19, s18, 16 +; VI-NEXT: v_add_f32_e32 v2, s19, v1 +; VI-NEXT: s_lshr_b64 s[20:21], s[20:21], 16 +; VI-NEXT: v_readfirstlane_b32 s19, v2 +; VI-NEXT: s_bfe_u32 s21, s19, 0x10010 +; VI-NEXT: s_add_i32 s21, s21, s19 +; VI-NEXT: s_addk_i32 s21, 0x7fff +; VI-NEXT: s_bitset1_b32 s19, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b64 s[8:9], vcc, exec -; VI-NEXT: s_cselect_b32 s8, s7, s10 -; VI-NEXT: s_and_b32 s7, s16, 0xffff0000 -; VI-NEXT: v_add_f32_e32 v2, s7, v1 -; VI-NEXT: v_readfirstlane_b32 s7, v2 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: s_and_b64 s[22:23], vcc, exec +; VI-NEXT: s_cselect_b32 s22, s19, s21 +; VI-NEXT: s_and_b32 s18, s18, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s18, v1 +; VI-NEXT: v_readfirstlane_b32 s18, v2 +; VI-NEXT: s_bfe_u32 s19, s18, 0x10010 +; VI-NEXT: s_add_i32 s19, s19, s18 +; VI-NEXT: s_add_i32 s21, s19, 0x7fff +; VI-NEXT: s_or_b32 s23, s18, 0x400000 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b64 s[10:11], vcc, exec -; VI-NEXT: s_cselect_b32 s7, s7, s9 -; VI-NEXT: s_lshr_b32 s9, s7, 16 -; VI-NEXT: s_lshl_b32 s7, s19, 16 -; VI-NEXT: v_add_f32_e32 v2, s7, v1 -; VI-NEXT: v_readfirstlane_b32 s7, v2 -; VI-NEXT: s_lshr_b64 s[16:17], s[8:9], 16 -; VI-NEXT: s_bfe_u32 s8, s7, 0x10010 -; VI-NEXT: s_add_i32 s8, s8, s7 -; VI-NEXT: s_add_i32 s10, s8, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: s_and_b64 s[18:19], vcc, exec +; VI-NEXT: s_cselect_b32 s18, s23, s21 +; VI-NEXT: s_lshl_b32 s21, s17, 16 +; VI-NEXT: v_add_f32_e32 v2, s21, v1 +; VI-NEXT: s_lshr_b32 s23, s18, 16 +; VI-NEXT: v_readfirstlane_b32 s21, v2 +; VI-NEXT: s_lshr_b64 s[18:19], s[22:23], 16 +; VI-NEXT: s_bfe_u32 s22, s21, 0x10010 +; VI-NEXT: s_add_i32 s22, s22, s21 +; VI-NEXT: s_add_i32 s24, s22, 0x7fff +; VI-NEXT: s_bitset1_b32 s21, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b64 s[8:9], vcc, exec -; VI-NEXT: s_cselect_b32 s8, s7, s10 -; VI-NEXT: s_and_b32 s7, s19, 0xffff0000 -; VI-NEXT: v_add_f32_e32 v2, s7, v1 -; VI-NEXT: v_readfirstlane_b32 s7, v2 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: s_and_b64 s[22:23], vcc, exec +; VI-NEXT: s_cselect_b32 s22, s21, s24 +; VI-NEXT: s_and_b32 s17, s17, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s17, v1 +; VI-NEXT: v_readfirstlane_b32 s17, v2 +; VI-NEXT: s_bfe_u32 s21, s17, 0x10010 +; VI-NEXT: s_add_i32 s21, s21, s17 +; VI-NEXT: s_addk_i32 s21, 0x7fff +; VI-NEXT: s_bitset1_b32 s17, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b64 s[10:11], vcc, exec -; VI-NEXT: s_cselect_b32 s7, s7, s9 -; VI-NEXT: s_lshr_b32 s9, s7, 16 -; VI-NEXT: s_lshl_b32 s7, s18, 16 -; VI-NEXT: v_add_f32_e32 v2, s7, v1 -; VI-NEXT: s_lshr_b64 s[8:9], s[8:9], 16 -; VI-NEXT: v_readfirstlane_b32 s7, v2 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: s_and_b64 s[24:25], vcc, exec +; VI-NEXT: s_cselect_b32 s17, s17, s21 +; VI-NEXT: s_lshr_b32 s23, s17, 16 +; VI-NEXT: s_lshl_b32 s17, s16, 16 +; VI-NEXT: v_add_f32_e32 v2, s17, v1 +; VI-NEXT: v_readfirstlane_b32 s17, v2 +; VI-NEXT: s_bfe_u32 s21, s17, 0x10010 +; VI-NEXT: s_add_i32 s21, s21, s17 +; VI-NEXT: s_lshr_b64 s[22:23], s[22:23], 16 +; VI-NEXT: s_addk_i32 s21, 0x7fff +; VI-NEXT: s_bitset1_b32 s17, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b64 s[10:11], vcc, exec -; VI-NEXT: s_cselect_b32 s10, s7, s9 -; VI-NEXT: s_and_b32 s7, s18, 0xffff0000 -; VI-NEXT: v_add_f32_e32 v2, s7, v1 -; VI-NEXT: v_readfirstlane_b32 s7, v2 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: s_and_b64 s[24:25], vcc, exec +; VI-NEXT: s_cselect_b32 s24, s17, s21 +; VI-NEXT: s_and_b32 s16, s16, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s16, v1 +; VI-NEXT: v_readfirstlane_b32 s16, v2 +; VI-NEXT: s_bfe_u32 s17, s16, 0x10010 +; VI-NEXT: s_add_i32 s17, s17, s16 +; VI-NEXT: s_add_i32 s21, s17, 0x7fff +; VI-NEXT: s_or_b32 s23, s16, 0x400000 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b64 s[12:13], vcc, exec -; VI-NEXT: s_cselect_b32 s7, s7, s9 -; VI-NEXT: s_lshr_b32 s11, s7, 16 -; VI-NEXT: s_lshl_b32 s7, s21, 16 -; VI-NEXT: v_add_f32_e32 v2, s7, v1 -; VI-NEXT: v_readfirstlane_b32 s7, v2 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_lshr_b64 s[18:19], s[10:11], 16 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: s_and_b64 s[16:17], vcc, exec +; VI-NEXT: s_cselect_b32 s16, s23, s21 +; VI-NEXT: s_lshl_b32 s21, s15, 16 +; VI-NEXT: v_add_f32_e32 v2, s21, v1 +; VI-NEXT: v_readfirstlane_b32 s21, v2 +; VI-NEXT: s_bfe_u32 s23, s21, 0x10010 +; VI-NEXT: s_lshr_b32 s25, s16, 16 +; VI-NEXT: s_add_i32 s23, s23, s21 +; VI-NEXT: s_lshr_b64 s[16:17], s[24:25], 16 +; VI-NEXT: s_addk_i32 s23, 0x7fff +; VI-NEXT: s_bitset1_b32 s21, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b64 s[10:11], vcc, exec -; VI-NEXT: s_cselect_b32 s10, s7, s9 -; VI-NEXT: s_and_b32 s7, s21, 0xffff0000 -; VI-NEXT: v_add_f32_e32 v2, s7, v1 -; VI-NEXT: v_readfirstlane_b32 s7, v2 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: s_and_b64 s[24:25], vcc, exec +; VI-NEXT: s_cselect_b32 s24, s21, s23 +; VI-NEXT: s_and_b32 s15, s15, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s15, v1 +; VI-NEXT: v_readfirstlane_b32 s15, v2 +; VI-NEXT: s_bfe_u32 s21, s15, 0x10010 +; VI-NEXT: s_add_i32 s21, s21, s15 +; VI-NEXT: s_addk_i32 s21, 0x7fff +; VI-NEXT: s_bitset1_b32 s15, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b64 s[12:13], vcc, exec -; VI-NEXT: s_cselect_b32 s7, s7, s9 -; VI-NEXT: s_lshr_b32 s11, s7, 16 -; VI-NEXT: s_lshl_b32 s7, s20, 16 -; VI-NEXT: v_add_f32_e32 v2, s7, v1 -; VI-NEXT: v_readfirstlane_b32 s7, v2 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_lshr_b64 s[10:11], s[10:11], 16 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: s_and_b64 s[26:27], vcc, exec +; VI-NEXT: s_cselect_b32 s15, s15, s21 +; VI-NEXT: s_lshr_b32 s25, s15, 16 +; VI-NEXT: s_lshl_b32 s15, s14, 16 +; VI-NEXT: v_add_f32_e32 v2, s15, v1 +; VI-NEXT: v_readfirstlane_b32 s15, v2 +; VI-NEXT: s_bfe_u32 s21, s15, 0x10010 +; VI-NEXT: s_add_i32 s21, s21, s15 +; VI-NEXT: s_lshr_b64 s[24:25], s[24:25], 16 +; VI-NEXT: s_addk_i32 s21, 0x7fff +; VI-NEXT: s_bitset1_b32 s15, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b64 s[12:13], vcc, exec -; VI-NEXT: s_cselect_b32 s12, s7, s9 -; VI-NEXT: s_and_b32 s7, s20, 0xffff0000 -; VI-NEXT: v_add_f32_e32 v2, s7, v1 -; VI-NEXT: v_readfirstlane_b32 s7, v2 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: s_and_b64 s[26:27], vcc, exec +; VI-NEXT: s_cselect_b32 s26, s15, s21 +; VI-NEXT: s_and_b32 s14, s14, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s14, v1 +; VI-NEXT: v_readfirstlane_b32 s14, v2 +; VI-NEXT: s_bfe_u32 s15, s14, 0x10010 +; VI-NEXT: s_add_i32 s15, s15, s14 +; VI-NEXT: s_add_i32 s21, s15, 0x7fff +; VI-NEXT: s_or_b32 s23, s14, 0x400000 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: s_and_b64 s[14:15], vcc, exec -; VI-NEXT: s_cselect_b32 s7, s7, s9 -; VI-NEXT: s_lshr_b32 s13, s7, 16 -; VI-NEXT: s_lshl_b32 s7, s23, 16 -; VI-NEXT: v_add_f32_e32 v2, s7, v1 -; VI-NEXT: v_readfirstlane_b32 s7, v2 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_lshr_b64 s[20:21], s[12:13], 16 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: s_cselect_b32 s14, s23, s21 +; VI-NEXT: s_lshl_b32 s21, s13, 16 +; VI-NEXT: v_add_f32_e32 v2, s21, v1 +; VI-NEXT: v_readfirstlane_b32 s21, v2 +; VI-NEXT: s_bfe_u32 s23, s21, 0x10010 +; VI-NEXT: s_lshr_b32 s27, s14, 16 +; VI-NEXT: s_add_i32 s23, s23, s21 +; VI-NEXT: s_lshr_b64 s[14:15], s[26:27], 16 +; VI-NEXT: s_addk_i32 s23, 0x7fff +; VI-NEXT: s_bitset1_b32 s21, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b64 s[12:13], vcc, exec -; VI-NEXT: s_cselect_b32 s12, s7, s9 -; VI-NEXT: s_and_b32 s7, s23, 0xffff0000 -; VI-NEXT: v_add_f32_e32 v2, s7, v1 -; VI-NEXT: v_readfirstlane_b32 s7, v2 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: s_and_b64 s[26:27], vcc, exec +; VI-NEXT: s_cselect_b32 s26, s21, s23 +; VI-NEXT: s_and_b32 s13, s13, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s13, v1 +; VI-NEXT: v_readfirstlane_b32 s13, v2 +; VI-NEXT: s_bfe_u32 s21, s13, 0x10010 +; VI-NEXT: s_add_i32 s21, s21, s13 +; VI-NEXT: s_addk_i32 s21, 0x7fff +; VI-NEXT: s_bitset1_b32 s13, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b64 s[14:15], vcc, exec -; VI-NEXT: s_cselect_b32 s7, s7, s9 -; VI-NEXT: s_lshr_b32 s13, s7, 16 -; VI-NEXT: s_lshl_b32 s7, s22, 16 -; VI-NEXT: v_add_f32_e32 v2, s7, v1 -; VI-NEXT: v_readfirstlane_b32 s7, v2 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_lshr_b64 s[12:13], s[12:13], 16 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: s_and_b64 s[28:29], vcc, exec +; VI-NEXT: s_cselect_b32 s13, s13, s21 +; VI-NEXT: s_lshr_b32 s27, s13, 16 +; VI-NEXT: s_lshl_b32 s13, s12, 16 +; VI-NEXT: v_add_f32_e32 v2, s13, v1 +; VI-NEXT: v_readfirstlane_b32 s13, v2 +; VI-NEXT: s_bfe_u32 s21, s13, 0x10010 +; VI-NEXT: s_add_i32 s21, s21, s13 +; VI-NEXT: s_lshr_b64 s[26:27], s[26:27], 16 +; VI-NEXT: s_addk_i32 s21, 0x7fff +; VI-NEXT: s_bitset1_b32 s13, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b64 s[14:15], vcc, exec -; VI-NEXT: s_cselect_b32 s14, s7, s9 -; VI-NEXT: s_and_b32 s7, s22, 0xffff0000 -; VI-NEXT: v_add_f32_e32 v2, s7, v1 -; VI-NEXT: v_readfirstlane_b32 s7, v2 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: s_and_b64 s[28:29], vcc, exec +; VI-NEXT: s_cselect_b32 s28, s13, s21 +; VI-NEXT: s_and_b32 s12, s12, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s12, v1 +; VI-NEXT: v_readfirstlane_b32 s12, v2 +; VI-NEXT: s_bfe_u32 s13, s12, 0x10010 +; VI-NEXT: s_add_i32 s13, s13, s12 +; VI-NEXT: s_add_i32 s21, s13, 0x7fff +; VI-NEXT: s_or_b32 s23, s12, 0x400000 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b64 s[22:23], vcc, exec -; VI-NEXT: s_cselect_b32 s7, s7, s9 -; VI-NEXT: s_lshr_b32 s15, s7, 16 -; VI-NEXT: s_lshl_b32 s7, s25, 16 -; VI-NEXT: v_add_f32_e32 v2, s7, v1 -; VI-NEXT: v_readfirstlane_b32 s7, v2 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_lshr_b64 s[22:23], s[14:15], 16 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: s_and_b64 s[12:13], vcc, exec +; VI-NEXT: s_cselect_b32 s12, s23, s21 +; VI-NEXT: s_lshl_b32 s21, s11, 16 +; VI-NEXT: v_add_f32_e32 v2, s21, v1 +; VI-NEXT: v_readfirstlane_b32 s21, v2 +; VI-NEXT: s_bfe_u32 s23, s21, 0x10010 +; VI-NEXT: s_lshr_b32 s29, s12, 16 +; VI-NEXT: s_add_i32 s23, s23, s21 +; VI-NEXT: s_lshr_b64 s[12:13], s[28:29], 16 +; VI-NEXT: s_addk_i32 s23, 0x7fff +; VI-NEXT: s_bitset1_b32 s21, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b64 s[14:15], vcc, exec -; VI-NEXT: s_cselect_b32 s14, s7, s9 -; VI-NEXT: s_and_b32 s7, s25, 0xffff0000 -; VI-NEXT: v_add_f32_e32 v2, s7, v1 -; VI-NEXT: v_readfirstlane_b32 s7, v2 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: s_and_b64 s[28:29], vcc, exec +; VI-NEXT: s_cselect_b32 s28, s21, s23 +; VI-NEXT: s_and_b32 s11, s11, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s11, v1 +; VI-NEXT: v_readfirstlane_b32 s11, v2 +; VI-NEXT: s_bfe_u32 s21, s11, 0x10010 +; VI-NEXT: s_add_i32 s21, s21, s11 +; VI-NEXT: s_addk_i32 s21, 0x7fff +; VI-NEXT: s_bitset1_b32 s11, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: s_and_b64 s[40:41], vcc, exec -; VI-NEXT: s_cselect_b32 s7, s7, s9 -; VI-NEXT: s_lshr_b32 s15, s7, 16 -; VI-NEXT: s_lshl_b32 s7, s24, 16 -; VI-NEXT: v_add_f32_e32 v2, s7, v1 -; VI-NEXT: v_readfirstlane_b32 s7, v2 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_lshr_b64 s[14:15], s[14:15], 16 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: s_cselect_b32 s11, s11, s21 +; VI-NEXT: s_lshr_b32 s29, s11, 16 +; VI-NEXT: s_lshl_b32 s11, s10, 16 +; VI-NEXT: v_add_f32_e32 v2, s11, v1 +; VI-NEXT: v_readfirstlane_b32 s11, v2 +; VI-NEXT: s_bfe_u32 s21, s11, 0x10010 +; VI-NEXT: s_add_i32 s21, s21, s11 +; VI-NEXT: s_lshr_b64 s[28:29], s[28:29], 16 +; VI-NEXT: s_addk_i32 s21, 0x7fff +; VI-NEXT: s_bitset1_b32 s11, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: s_and_b64 s[40:41], vcc, exec -; VI-NEXT: s_cselect_b32 s40, s7, s9 -; VI-NEXT: s_and_b32 s7, s24, 0xffff0000 -; VI-NEXT: v_add_f32_e32 v2, s7, v1 -; VI-NEXT: v_readfirstlane_b32 s7, v2 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: s_cselect_b32 s40, s11, s21 +; VI-NEXT: s_and_b32 s10, s10, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s10, v1 +; VI-NEXT: v_readfirstlane_b32 s10, v2 +; VI-NEXT: s_bfe_u32 s11, s10, 0x10010 +; VI-NEXT: s_add_i32 s11, s11, s10 +; VI-NEXT: s_add_i32 s21, s11, 0x7fff +; VI-NEXT: s_or_b32 s23, s10, 0x400000 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b64 s[24:25], vcc, exec -; VI-NEXT: s_cselect_b32 s7, s7, s9 -; VI-NEXT: s_lshr_b32 s41, s7, 16 -; VI-NEXT: s_lshl_b32 s7, s27, 16 -; VI-NEXT: v_add_f32_e32 v2, s7, v1 -; VI-NEXT: v_readfirstlane_b32 s7, v2 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_lshr_b64 s[24:25], s[40:41], 16 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: s_and_b64 s[10:11], vcc, exec +; VI-NEXT: s_cselect_b32 s10, s23, s21 +; VI-NEXT: s_lshl_b32 s21, s9, 16 +; VI-NEXT: v_add_f32_e32 v2, s21, v1 +; VI-NEXT: v_readfirstlane_b32 s21, v2 +; VI-NEXT: s_bfe_u32 s23, s21, 0x10010 +; VI-NEXT: s_lshr_b32 s41, s10, 16 +; VI-NEXT: s_add_i32 s23, s23, s21 +; VI-NEXT: s_lshr_b64 s[10:11], s[40:41], 16 +; VI-NEXT: s_addk_i32 s23, 0x7fff +; VI-NEXT: s_bitset1_b32 s21, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: s_and_b64 s[40:41], vcc, exec -; VI-NEXT: s_cselect_b32 s40, s7, s9 -; VI-NEXT: s_and_b32 s7, s27, 0xffff0000 -; VI-NEXT: v_add_f32_e32 v2, s7, v1 -; VI-NEXT: v_readfirstlane_b32 s7, v2 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: s_cselect_b32 s40, s21, s23 +; VI-NEXT: s_and_b32 s9, s9, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s9, v1 +; VI-NEXT: v_readfirstlane_b32 s9, v2 +; VI-NEXT: s_bfe_u32 s21, s9, 0x10010 +; VI-NEXT: s_add_i32 s21, s21, s9 +; VI-NEXT: s_addk_i32 s21, 0x7fff +; VI-NEXT: s_bitset1_b32 s9, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: s_and_b64 s[42:43], vcc, exec -; VI-NEXT: s_cselect_b32 s7, s7, s9 -; VI-NEXT: s_lshr_b32 s41, s7, 16 -; VI-NEXT: s_lshl_b32 s7, s26, 16 -; VI-NEXT: v_add_f32_e32 v2, s7, v1 -; VI-NEXT: v_readfirstlane_b32 s7, v2 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_cselect_b32 s9, s9, s21 +; VI-NEXT: s_lshr_b32 s41, s9, 16 +; VI-NEXT: s_lshl_b32 s9, s8, 16 +; VI-NEXT: v_add_f32_e32 v2, s9, v1 +; VI-NEXT: v_readfirstlane_b32 s9, v2 +; VI-NEXT: s_bfe_u32 s21, s9, 0x10010 +; VI-NEXT: s_add_i32 s21, s21, s9 ; VI-NEXT: s_lshr_b64 s[40:41], s[40:41], 16 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: s_addk_i32 s21, 0x7fff +; VI-NEXT: s_bitset1_b32 s9, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: s_and_b64 s[42:43], vcc, exec -; VI-NEXT: s_cselect_b32 s42, s7, s9 -; VI-NEXT: s_and_b32 s7, s26, 0xffff0000 -; VI-NEXT: v_add_f32_e32 v2, s7, v1 -; VI-NEXT: v_readfirstlane_b32 s7, v2 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: s_cselect_b32 s42, s9, s21 +; VI-NEXT: s_and_b32 s8, s8, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s8, v1 +; VI-NEXT: v_readfirstlane_b32 s8, v2 +; VI-NEXT: s_bfe_u32 s9, s8, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s8 +; VI-NEXT: s_add_i32 s21, s9, 0x7fff +; VI-NEXT: s_or_b32 s23, s8, 0x400000 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b64 s[26:27], vcc, exec -; VI-NEXT: s_cselect_b32 s7, s7, s9 -; VI-NEXT: s_lshr_b32 s43, s7, 16 -; VI-NEXT: s_lshl_b32 s7, s29, 16 -; VI-NEXT: v_add_f32_e32 v2, s7, v1 -; VI-NEXT: v_readfirstlane_b32 s7, v2 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_lshr_b64 s[26:27], s[42:43], 16 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: s_and_b64 s[8:9], vcc, exec +; VI-NEXT: s_cselect_b32 s8, s23, s21 +; VI-NEXT: s_lshl_b32 s21, s7, 16 +; VI-NEXT: v_add_f32_e32 v2, s21, v1 +; VI-NEXT: v_readfirstlane_b32 s21, v2 +; VI-NEXT: s_bfe_u32 s23, s21, 0x10010 +; VI-NEXT: s_lshr_b32 s43, s8, 16 +; VI-NEXT: s_add_i32 s23, s23, s21 +; VI-NEXT: s_lshr_b64 s[8:9], s[42:43], 16 +; VI-NEXT: s_addk_i32 s23, 0x7fff +; VI-NEXT: s_bitset1_b32 s21, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: s_and_b64 s[42:43], vcc, exec -; VI-NEXT: s_cselect_b32 s42, s7, s9 -; VI-NEXT: s_and_b32 s7, s29, 0xffff0000 +; VI-NEXT: s_cselect_b32 s42, s21, s23 +; VI-NEXT: s_and_b32 s7, s7, 0xffff0000 ; VI-NEXT: v_add_f32_e32 v2, s7, v1 ; VI-NEXT: v_readfirstlane_b32 s7, v2 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bfe_u32 s21, s7, 0x10010 +; VI-NEXT: s_add_i32 s21, s21, s7 +; VI-NEXT: s_addk_i32 s21, 0x7fff ; VI-NEXT: s_bitset1_b32 s7, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: s_and_b64 s[44:45], vcc, exec -; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_cselect_b32 s7, s7, s21 ; VI-NEXT: s_lshr_b32 s43, s7, 16 -; VI-NEXT: s_lshl_b32 s7, s28, 16 +; VI-NEXT: s_lshl_b32 s7, s6, 16 ; VI-NEXT: v_add_f32_e32 v2, s7, v1 ; VI-NEXT: v_readfirstlane_b32 s7, v2 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_bfe_u32 s21, s7, 0x10010 +; VI-NEXT: s_add_i32 s21, s21, s7 ; VI-NEXT: s_lshr_b64 s[46:47], s[42:43], 16 -; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_addk_i32 s21, 0x7fff ; VI-NEXT: s_bitset1_b32 s7, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: s_and_b64 s[42:43], vcc, exec -; VI-NEXT: s_cselect_b32 s42, s7, s9 -; VI-NEXT: s_and_b32 s7, s28, 0xffff0000 -; VI-NEXT: v_add_f32_e32 v2, s7, v1 -; VI-NEXT: v_readfirstlane_b32 s7, v2 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: s_cselect_b32 s42, s7, s21 +; VI-NEXT: s_and_b32 s6, s6, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_add_i32 s21, s7, 0x7fff +; VI-NEXT: s_or_b32 s23, s6, 0x400000 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b64 s[28:29], vcc, exec -; VI-NEXT: s_cselect_b32 s7, s7, s9 -; VI-NEXT: s_lshr_b32 s43, s7, 16 -; VI-NEXT: s_lshl_b32 s7, s5, 16 -; VI-NEXT: v_add_f32_e32 v2, s7, v1 -; VI-NEXT: v_readfirstlane_b32 s7, v2 -; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 -; VI-NEXT: s_add_i32 s9, s9, s7 -; VI-NEXT: s_lshr_b64 s[28:29], s[42:43], 16 -; VI-NEXT: s_addk_i32 s9, 0x7fff -; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s6, s23, s21 +; VI-NEXT: s_lshl_b32 s21, s5, 16 +; VI-NEXT: v_add_f32_e32 v2, s21, v1 +; VI-NEXT: v_readfirstlane_b32 s21, v2 +; VI-NEXT: s_bfe_u32 s23, s21, 0x10010 +; VI-NEXT: s_lshr_b32 s43, s6, 16 +; VI-NEXT: s_add_i32 s23, s23, s21 +; VI-NEXT: s_lshr_b64 s[6:7], s[42:43], 16 +; VI-NEXT: s_addk_i32 s23, 0x7fff +; VI-NEXT: s_bitset1_b32 s21, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: s_and_b64 s[42:43], vcc, exec -; VI-NEXT: s_cselect_b32 s42, s7, s9 +; VI-NEXT: s_cselect_b32 s42, s21, s23 ; VI-NEXT: s_and_b32 s5, s5, 0xffff0000 ; VI-NEXT: v_add_f32_e32 v2, s5, v1 ; VI-NEXT: v_readfirstlane_b32 s5, v2 -; VI-NEXT: s_bfe_u32 s7, s5, 0x10010 -; VI-NEXT: s_add_i32 s7, s7, s5 -; VI-NEXT: s_addk_i32 s7, 0x7fff +; VI-NEXT: s_bfe_u32 s21, s5, 0x10010 +; VI-NEXT: s_add_i32 s21, s21, s5 +; VI-NEXT: s_addk_i32 s21, 0x7fff ; VI-NEXT: s_bitset1_b32 s5, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: s_and_b64 s[44:45], vcc, exec -; VI-NEXT: s_cselect_b32 s5, s5, s7 +; VI-NEXT: s_cselect_b32 s5, s5, s21 ; VI-NEXT: s_lshr_b32 s43, s5, 16 ; VI-NEXT: s_lshl_b32 s5, s4, 16 ; VI-NEXT: v_add_f32_e32 v2, s5, v1 ; VI-NEXT: v_readfirstlane_b32 s5, v2 -; VI-NEXT: s_bfe_u32 s7, s5, 0x10010 -; VI-NEXT: s_add_i32 s7, s7, s5 +; VI-NEXT: s_bfe_u32 s21, s5, 0x10010 +; VI-NEXT: s_add_i32 s21, s21, s5 ; VI-NEXT: s_lshr_b64 s[56:57], s[42:43], 16 -; VI-NEXT: s_addk_i32 s7, 0x7fff +; VI-NEXT: s_addk_i32 s21, 0x7fff ; VI-NEXT: s_bitset1_b32 s5, 22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: s_and_b64 s[42:43], vcc, exec -; VI-NEXT: s_cselect_b32 s42, s5, s7 +; VI-NEXT: s_cselect_b32 s42, s5, s21 ; VI-NEXT: s_and_b32 s4, s4, 0xffff0000 ; VI-NEXT: v_add_f32_e32 v1, s4, v1 ; VI-NEXT: v_readfirstlane_b32 s4, v1 ; VI-NEXT: s_bfe_u32 s5, s4, 0x10010 ; VI-NEXT: s_add_i32 s5, s5, s4 -; VI-NEXT: s_add_i32 s7, s5, 0x7fff -; VI-NEXT: s_or_b32 s9, s4, 0x400000 +; VI-NEXT: s_add_i32 s21, s5, 0x7fff +; VI-NEXT: s_or_b32 s23, s4, 0x400000 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: s_cselect_b32 s4, s9, s7 +; VI-NEXT: s_cselect_b32 s4, s23, s21 ; VI-NEXT: s_lshr_b32 s43, s4, 16 ; VI-NEXT: s_lshr_b64 s[4:5], s[42:43], 16 -; VI-NEXT: s_mov_b32 s17, s6 -; VI-NEXT: s_mov_b32 s19, s8 -; VI-NEXT: s_mov_b32 s21, s10 -; VI-NEXT: s_mov_b32 s23, s12 -; VI-NEXT: s_mov_b32 s25, s14 -; VI-NEXT: s_mov_b32 s27, s40 -; VI-NEXT: s_mov_b32 s29, s46 +; VI-NEXT: s_mov_b32 s19, s20 +; VI-NEXT: s_mov_b32 s17, s22 +; VI-NEXT: s_mov_b32 s15, s24 +; VI-NEXT: s_mov_b32 s13, s26 +; VI-NEXT: s_mov_b32 s11, s28 +; VI-NEXT: s_mov_b32 s9, s40 +; VI-NEXT: s_mov_b32 s7, s46 ; VI-NEXT: s_mov_b32 s5, s56 ; VI-NEXT: s_lshr_b64 s[42:43], s[4:5], 24 -; VI-NEXT: s_lshr_b64 s[44:45], s[28:29], 24 -; VI-NEXT: s_lshr_b32 s7, s56, 24 -; VI-NEXT: s_lshr_b32 s9, s56, 16 -; VI-NEXT: s_lshr_b32 s11, s56, 8 -; VI-NEXT: s_lshr_b32 s13, s4, 16 -; VI-NEXT: s_lshr_b32 s15, s4, 8 +; VI-NEXT: s_lshr_b64 s[44:45], s[6:7], 24 +; VI-NEXT: s_lshr_b32 s21, s56, 24 +; VI-NEXT: s_lshr_b32 s23, s56, 16 +; VI-NEXT: s_lshr_b32 s25, s56, 8 +; VI-NEXT: s_lshr_b32 s27, s4, 16 +; VI-NEXT: s_lshr_b32 s29, s4, 8 ; VI-NEXT: s_lshr_b32 s41, s46, 24 ; VI-NEXT: s_lshr_b32 s47, s46, 16 ; VI-NEXT: s_lshr_b32 s57, s46, 8 -; VI-NEXT: s_lshr_b32 s88, s28, 16 -; VI-NEXT: s_lshr_b32 s89, s28, 8 +; VI-NEXT: s_lshr_b32 s88, s6, 16 +; VI-NEXT: s_lshr_b32 s89, s6, 8 ; VI-NEXT: s_lshr_b32 s90, s40, 24 ; VI-NEXT: s_lshr_b32 s91, s40, 16 ; VI-NEXT: s_lshr_b32 s30, s40, 8 -; VI-NEXT: s_lshr_b32 s31, s26, 16 -; VI-NEXT: s_lshr_b32 s34, s26, 8 -; VI-NEXT: s_lshr_b32 s35, s14, 24 -; VI-NEXT: s_lshr_b32 s36, s14, 16 -; VI-NEXT: s_lshr_b32 s37, s14, 8 -; VI-NEXT: s_lshr_b32 s38, s24, 16 -; VI-NEXT: s_lshr_b32 s39, s24, 8 -; VI-NEXT: s_lshr_b32 s48, s12, 24 -; VI-NEXT: s_lshr_b32 s49, s12, 16 -; VI-NEXT: s_lshr_b32 s50, s12, 8 -; VI-NEXT: s_lshr_b32 s51, s22, 16 -; VI-NEXT: s_lshr_b32 s52, s22, 8 -; VI-NEXT: s_lshr_b32 s53, s10, 24 -; VI-NEXT: s_lshr_b32 s54, s10, 16 -; VI-NEXT: s_lshr_b32 s55, s10, 8 -; VI-NEXT: s_lshr_b32 s64, s20, 16 -; VI-NEXT: s_lshr_b32 s65, s20, 8 -; VI-NEXT: s_lshr_b32 s66, s8, 24 -; VI-NEXT: s_lshr_b32 s67, s8, 16 -; VI-NEXT: s_lshr_b32 s68, s8, 8 -; VI-NEXT: s_lshr_b32 s69, s18, 16 -; VI-NEXT: s_lshr_b32 s70, s18, 8 -; VI-NEXT: s_lshr_b32 s71, s6, 24 -; VI-NEXT: s_lshr_b32 s80, s6, 16 -; VI-NEXT: s_lshr_b32 s81, s6, 8 -; VI-NEXT: s_lshr_b32 s82, s16, 16 -; VI-NEXT: s_lshr_b32 s83, s16, 8 -; VI-NEXT: s_lshr_b64 s[58:59], s[26:27], 24 -; VI-NEXT: s_lshr_b64 s[60:61], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[72:73], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[74:75], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[76:77], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s31, s8, 16 +; VI-NEXT: s_lshr_b32 s34, s8, 8 +; VI-NEXT: s_lshr_b32 s35, s28, 24 +; VI-NEXT: s_lshr_b32 s36, s28, 16 +; VI-NEXT: s_lshr_b32 s37, s28, 8 +; VI-NEXT: s_lshr_b32 s38, s10, 16 +; VI-NEXT: s_lshr_b32 s39, s10, 8 +; VI-NEXT: s_lshr_b32 s48, s26, 24 +; VI-NEXT: s_lshr_b32 s49, s26, 16 +; VI-NEXT: s_lshr_b32 s50, s26, 8 +; VI-NEXT: s_lshr_b32 s51, s12, 16 +; VI-NEXT: s_lshr_b32 s52, s12, 8 +; VI-NEXT: s_lshr_b32 s53, s24, 24 +; VI-NEXT: s_lshr_b32 s54, s24, 16 +; VI-NEXT: s_lshr_b32 s55, s24, 8 +; VI-NEXT: s_lshr_b32 s64, s14, 16 +; VI-NEXT: s_lshr_b32 s65, s14, 8 +; VI-NEXT: s_lshr_b32 s66, s22, 24 +; VI-NEXT: s_lshr_b32 s67, s22, 16 +; VI-NEXT: s_lshr_b32 s68, s22, 8 +; VI-NEXT: s_lshr_b32 s69, s16, 16 +; VI-NEXT: s_lshr_b32 s70, s16, 8 +; VI-NEXT: s_lshr_b32 s71, s20, 24 +; VI-NEXT: s_lshr_b32 s80, s20, 16 +; VI-NEXT: s_lshr_b32 s81, s20, 8 +; VI-NEXT: s_lshr_b32 s82, s18, 16 +; VI-NEXT: s_lshr_b32 s83, s18, 8 +; VI-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 +; VI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 +; VI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[74:75], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[76:77], s[18:19], 24 ; VI-NEXT: .LBB109_3: ; %end -; VI-NEXT: s_and_b32 s5, s16, 0xff -; VI-NEXT: s_lshl_b32 s16, s83, 8 -; VI-NEXT: s_or_b32 s5, s5, s16 -; VI-NEXT: s_lshl_b32 s16, s76, 8 -; VI-NEXT: s_and_b32 s17, s82, 0xff -; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s7, s83, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_lshl_b32 s7, s76, 8 +; VI-NEXT: s_and_b32 s9, s82, 0xff +; VI-NEXT: s_or_b32 s7, s9, s7 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_lshl_b32 s16, s16, 16 -; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s7 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_and_b32 s5, s6, 0xff -; VI-NEXT: s_lshl_b32 s6, s81, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s80, 0xff -; VI-NEXT: s_lshl_b32 s16, s71, 8 -; VI-NEXT: s_or_b32 s6, s6, s16 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s7, s81, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s80, 0xff +; VI-NEXT: s_lshl_b32 s9, s71, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s7 ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: s_and_b32 s5, s18, 0xff -; VI-NEXT: s_lshl_b32 s6, s70, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s69, 0xff -; VI-NEXT: s_lshl_b32 s16, s74, 8 -; VI-NEXT: s_or_b32 s6, s6, s16 +; VI-NEXT: s_and_b32 s5, s16, 0xff +; VI-NEXT: s_lshl_b32 s7, s70, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s69, 0xff +; VI-NEXT: s_lshl_b32 s9, s74, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v0 -; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_or_b32 s5, s5, s7 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: s_and_b32 s5, s8, 0xff -; VI-NEXT: s_lshl_b32 s6, s68, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s67, 0xff -; VI-NEXT: s_lshl_b32 s8, s66, 8 -; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s5, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s68, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s67, 0xff +; VI-NEXT: s_lshl_b32 s9, s66, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0 -; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_or_b32 s5, s5, s7 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: s_and_b32 s5, s20, 0xff -; VI-NEXT: s_lshl_b32 s6, s65, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s64, 0xff -; VI-NEXT: s_lshl_b32 s8, s72, 8 -; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s5, s14, 0xff +; VI-NEXT: s_lshl_b32 s7, s65, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s64, 0xff +; VI-NEXT: s_lshl_b32 s9, s72, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0 -; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_or_b32 s5, s5, s7 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: s_and_b32 s5, s10, 0xff -; VI-NEXT: s_lshl_b32 s6, s55, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s54, 0xff -; VI-NEXT: s_lshl_b32 s8, s53, 8 -; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s5, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s55, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s54, 0xff +; VI-NEXT: s_lshl_b32 s9, s53, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0 -; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_or_b32 s5, s5, s7 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: s_and_b32 s5, s22, 0xff -; VI-NEXT: s_lshl_b32 s6, s52, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s51, 0xff -; VI-NEXT: s_lshl_b32 s8, s62, 8 -; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s5, s12, 0xff +; VI-NEXT: s_lshl_b32 s7, s52, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s51, 0xff +; VI-NEXT: s_lshl_b32 s9, s62, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0 -; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_or_b32 s5, s5, s7 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: s_and_b32 s5, s12, 0xff -; VI-NEXT: s_lshl_b32 s6, s50, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s49, 0xff -; VI-NEXT: s_lshl_b32 s8, s48, 8 -; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s5, s26, 0xff +; VI-NEXT: s_lshl_b32 s7, s50, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s49, 0xff +; VI-NEXT: s_lshl_b32 s9, s48, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0 -; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_or_b32 s5, s5, s7 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: s_and_b32 s5, s24, 0xff -; VI-NEXT: s_lshl_b32 s6, s39, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s38, 0xff -; VI-NEXT: s_lshl_b32 s8, s60, 8 -; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s5, s10, 0xff +; VI-NEXT: s_lshl_b32 s7, s39, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s38, 0xff +; VI-NEXT: s_lshl_b32 s9, s60, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0 -; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_or_b32 s5, s5, s7 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: s_and_b32 s5, s14, 0xff -; VI-NEXT: s_lshl_b32 s6, s37, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s36, 0xff -; VI-NEXT: s_lshl_b32 s8, s35, 8 -; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s5, s28, 0xff +; VI-NEXT: s_lshl_b32 s7, s37, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s36, 0xff +; VI-NEXT: s_lshl_b32 s9, s35, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0 -; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_or_b32 s5, s5, s7 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: s_and_b32 s5, s26, 0xff -; VI-NEXT: s_lshl_b32 s6, s34, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s31, 0xff +; VI-NEXT: s_and_b32 s5, s8, 0xff +; VI-NEXT: s_lshl_b32 s7, s34, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s31, 0xff ; VI-NEXT: s_lshl_b32 s8, s58, 8 -; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_or_b32 s7, s7, s8 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 36, v0 -; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_or_b32 s5, s5, s7 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: s_and_b32 s5, s40, 0xff -; VI-NEXT: s_lshl_b32 s6, s30, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s91, 0xff +; VI-NEXT: s_lshl_b32 s7, s30, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s91, 0xff ; VI-NEXT: s_lshl_b32 s8, s90, 8 -; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_or_b32 s7, s7, s8 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 40, v0 -; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_or_b32 s5, s5, s7 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: s_and_b32 s5, s28, 0xff +; VI-NEXT: s_and_b32 s5, s6, 0xff ; VI-NEXT: s_lshl_b32 s6, s89, 8 ; VI-NEXT: s_or_b32 s5, s5, s6 ; VI-NEXT: s_and_b32 s6, s88, 0xff -; VI-NEXT: s_lshl_b32 s8, s44, 8 -; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_lshl_b32 s7, s44, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 44, v0 @@ -89093,8 +89698,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: s_lshl_b32 s6, s57, 8 ; VI-NEXT: s_or_b32 s5, s5, s6 ; VI-NEXT: s_and_b32 s6, s47, 0xff -; VI-NEXT: s_lshl_b32 s8, s41, 8 -; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_lshl_b32 s7, s41, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v0 @@ -89102,9 +89707,9 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: s_and_b32 s4, s4, 0xff -; VI-NEXT: s_lshl_b32 s5, s15, 8 +; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, s13, 0xff +; VI-NEXT: s_and_b32 s5, s27, 0xff ; VI-NEXT: s_lshl_b32 s6, s42, 8 ; VI-NEXT: s_or_b32 s5, s5, s6 ; VI-NEXT: s_and_b32 s4, s4, 0xffff @@ -89114,10 +89719,10 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_and_b32 s4, s56, 0xff -; VI-NEXT: s_lshl_b32 s5, s11, 8 +; VI-NEXT: s_lshl_b32 s5, s25, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, s9, 0xff -; VI-NEXT: s_lshl_b32 s6, s7, 8 +; VI-NEXT: s_and_b32 s5, s23, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 ; VI-NEXT: s_or_b32 s5, s5, s6 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s5, s5, 16 @@ -89127,36 +89732,36 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: v_readlane_b32 s83, v4, 27 -; VI-NEXT: v_readlane_b32 s82, v4, 26 -; VI-NEXT: v_readlane_b32 s81, v4, 25 -; VI-NEXT: v_readlane_b32 s80, v4, 24 -; VI-NEXT: v_readlane_b32 s71, v4, 23 -; VI-NEXT: v_readlane_b32 s70, v4, 22 -; VI-NEXT: v_readlane_b32 s69, v4, 21 -; VI-NEXT: v_readlane_b32 s68, v4, 20 -; VI-NEXT: v_readlane_b32 s67, v4, 19 -; VI-NEXT: v_readlane_b32 s66, v4, 18 -; VI-NEXT: v_readlane_b32 s65, v4, 17 -; VI-NEXT: v_readlane_b32 s64, v4, 16 -; VI-NEXT: v_readlane_b32 s55, v4, 15 -; VI-NEXT: v_readlane_b32 s54, v4, 14 -; VI-NEXT: v_readlane_b32 s53, v4, 13 -; VI-NEXT: v_readlane_b32 s52, v4, 12 -; VI-NEXT: v_readlane_b32 s51, v4, 11 -; VI-NEXT: v_readlane_b32 s50, v4, 10 -; VI-NEXT: v_readlane_b32 s49, v4, 9 -; VI-NEXT: v_readlane_b32 s48, v4, 8 -; VI-NEXT: v_readlane_b32 s39, v4, 7 -; VI-NEXT: v_readlane_b32 s38, v4, 6 -; VI-NEXT: v_readlane_b32 s37, v4, 5 -; VI-NEXT: v_readlane_b32 s36, v4, 4 -; VI-NEXT: v_readlane_b32 s35, v4, 3 -; VI-NEXT: v_readlane_b32 s34, v4, 2 -; VI-NEXT: v_readlane_b32 s31, v4, 1 -; VI-NEXT: v_readlane_b32 s30, v4, 0 +; VI-NEXT: v_readlane_b32 s83, v18, 27 +; VI-NEXT: v_readlane_b32 s82, v18, 26 +; VI-NEXT: v_readlane_b32 s81, v18, 25 +; VI-NEXT: v_readlane_b32 s80, v18, 24 +; VI-NEXT: v_readlane_b32 s71, v18, 23 +; VI-NEXT: v_readlane_b32 s70, v18, 22 +; VI-NEXT: v_readlane_b32 s69, v18, 21 +; VI-NEXT: v_readlane_b32 s68, v18, 20 +; VI-NEXT: v_readlane_b32 s67, v18, 19 +; VI-NEXT: v_readlane_b32 s66, v18, 18 +; VI-NEXT: v_readlane_b32 s65, v18, 17 +; VI-NEXT: v_readlane_b32 s64, v18, 16 +; VI-NEXT: v_readlane_b32 s55, v18, 15 +; VI-NEXT: v_readlane_b32 s54, v18, 14 +; VI-NEXT: v_readlane_b32 s53, v18, 13 +; VI-NEXT: v_readlane_b32 s52, v18, 12 +; VI-NEXT: v_readlane_b32 s51, v18, 11 +; VI-NEXT: v_readlane_b32 s50, v18, 10 +; VI-NEXT: v_readlane_b32 s49, v18, 9 +; VI-NEXT: v_readlane_b32 s48, v18, 8 +; VI-NEXT: v_readlane_b32 s39, v18, 7 +; VI-NEXT: v_readlane_b32 s38, v18, 6 +; VI-NEXT: v_readlane_b32 s37, v18, 5 +; VI-NEXT: v_readlane_b32 s36, v18, 4 +; VI-NEXT: v_readlane_b32 s35, v18, 3 +; VI-NEXT: v_readlane_b32 s34, v18, 2 +; VI-NEXT: v_readlane_b32 s31, v18, 1 +; VI-NEXT: v_readlane_b32 s30, v18, 0 ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -89164,35 +89769,35 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: ; implicit-def: $sgpr83 ; VI-NEXT: ; implicit-def: $sgpr82 ; VI-NEXT: ; implicit-def: $sgpr76 -; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr20 ; VI-NEXT: ; implicit-def: $sgpr81 ; VI-NEXT: ; implicit-def: $sgpr80 ; VI-NEXT: ; implicit-def: $sgpr71 ; VI-NEXT: ; implicit-def: $sgpr70 ; VI-NEXT: ; implicit-def: $sgpr69 ; VI-NEXT: ; implicit-def: $sgpr74 -; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr22 ; VI-NEXT: ; implicit-def: $sgpr68 ; VI-NEXT: ; implicit-def: $sgpr67 ; VI-NEXT: ; implicit-def: $sgpr66 ; VI-NEXT: ; implicit-def: $sgpr65 ; VI-NEXT: ; implicit-def: $sgpr64 ; VI-NEXT: ; implicit-def: $sgpr72 -; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr24 ; VI-NEXT: ; implicit-def: $sgpr55 ; VI-NEXT: ; implicit-def: $sgpr54 ; VI-NEXT: ; implicit-def: $sgpr53 ; VI-NEXT: ; implicit-def: $sgpr52 ; VI-NEXT: ; implicit-def: $sgpr51 ; VI-NEXT: ; implicit-def: $sgpr62 -; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; implicit-def: $sgpr50 ; VI-NEXT: ; implicit-def: $sgpr49 ; VI-NEXT: ; implicit-def: $sgpr48 ; VI-NEXT: ; implicit-def: $sgpr39 ; VI-NEXT: ; implicit-def: $sgpr38 ; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr28 ; VI-NEXT: ; implicit-def: $sgpr37 ; VI-NEXT: ; implicit-def: $sgpr36 ; VI-NEXT: ; implicit-def: $sgpr35 @@ -89210,41 +89815,69 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: ; implicit-def: $sgpr57 ; VI-NEXT: ; implicit-def: $sgpr47 ; VI-NEXT: ; implicit-def: $sgpr41 -; VI-NEXT: ; implicit-def: $sgpr15 -; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr27 ; VI-NEXT: ; implicit-def: $sgpr42 ; VI-NEXT: ; implicit-def: $sgpr56 -; VI-NEXT: ; implicit-def: $sgpr11 -; VI-NEXT: ; implicit-def: $sgpr9 -; VI-NEXT: ; implicit-def: $sgpr7 +; VI-NEXT: ; implicit-def: $sgpr25 +; VI-NEXT: ; implicit-def: $sgpr23 +; VI-NEXT: ; implicit-def: $sgpr21 ; VI-NEXT: s_branch .LBB109_2 ; ; GFX9-LABEL: bitcast_v32bf16_to_v64i8_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v4, s30, 0 -; GFX9-NEXT: v_writelane_b32 v4, s31, 1 -; GFX9-NEXT: v_writelane_b32 v4, s34, 2 -; GFX9-NEXT: v_writelane_b32 v4, s35, 3 -; GFX9-NEXT: v_writelane_b32 v4, s36, 4 -; GFX9-NEXT: v_writelane_b32 v4, s37, 5 -; GFX9-NEXT: v_writelane_b32 v4, s38, 6 -; GFX9-NEXT: v_writelane_b32 v4, s39, 7 -; GFX9-NEXT: v_writelane_b32 v4, s48, 8 -; GFX9-NEXT: v_writelane_b32 v4, s49, 9 -; GFX9-NEXT: v_writelane_b32 v4, s50, 10 -; GFX9-NEXT: v_writelane_b32 v4, s51, 11 -; GFX9-NEXT: v_writelane_b32 v4, s52, 12 -; GFX9-NEXT: v_writelane_b32 v4, s53, 13 +; GFX9-NEXT: v_writelane_b32 v18, s30, 0 +; GFX9-NEXT: v_writelane_b32 v18, s31, 1 +; GFX9-NEXT: v_writelane_b32 v18, s34, 2 +; GFX9-NEXT: v_writelane_b32 v18, s35, 3 +; GFX9-NEXT: v_writelane_b32 v18, s36, 4 +; GFX9-NEXT: v_writelane_b32 v18, s37, 5 +; GFX9-NEXT: v_writelane_b32 v18, s38, 6 +; GFX9-NEXT: v_writelane_b32 v18, s39, 7 +; GFX9-NEXT: v_writelane_b32 v18, s48, 8 +; GFX9-NEXT: v_writelane_b32 v18, s49, 9 +; GFX9-NEXT: v_writelane_b32 v18, s50, 10 +; GFX9-NEXT: v_writelane_b32 v18, s51, 11 +; GFX9-NEXT: v_writelane_b32 v18, s52, 12 +; GFX9-NEXT: v_writelane_b32 v18, s53, 13 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: v_mov_b32_e32 v5, s17 +; GFX9-NEXT: v_mov_b32_e32 v6, s18 +; GFX9-NEXT: v_mov_b32_e32 v7, s19 +; GFX9-NEXT: v_mov_b32_e32 v8, s20 +; GFX9-NEXT: v_mov_b32_e32 v9, s21 +; GFX9-NEXT: v_mov_b32_e32 v10, s22 +; GFX9-NEXT: v_mov_b32_e32 v11, s23 +; GFX9-NEXT: v_mov_b32_e32 v12, s24 +; GFX9-NEXT: v_mov_b32_e32 v13, s25 +; GFX9-NEXT: v_mov_b32_e32 v14, s26 +; GFX9-NEXT: v_mov_b32_e32 v15, s27 +; GFX9-NEXT: v_mov_b32_e32 v16, s28 +; GFX9-NEXT: v_mov_b32_e32 v17, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX9-NEXT: v_writelane_b32 v4, s54, 14 +; GFX9-NEXT: v_writelane_b32 v18, s54, 14 +; GFX9-NEXT: v_readfirstlane_b32 s18, v4 +; GFX9-NEXT: v_readfirstlane_b32 s19, v5 +; GFX9-NEXT: v_readfirstlane_b32 s16, v6 +; GFX9-NEXT: v_readfirstlane_b32 s17, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s12, v10 +; GFX9-NEXT: v_readfirstlane_b32 s13, v11 +; GFX9-NEXT: v_readfirstlane_b32 s10, v12 +; GFX9-NEXT: v_readfirstlane_b32 s11, v13 +; GFX9-NEXT: v_readfirstlane_b32 s8, v14 +; GFX9-NEXT: v_readfirstlane_b32 s9, v15 +; GFX9-NEXT: v_readfirstlane_b32 s6, v16 +; GFX9-NEXT: v_readfirstlane_b32 s7, v17 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec ; GFX9-NEXT: v_readfirstlane_b32 s5, v2 -; GFX9-NEXT: v_writelane_b32 v4, s55, 15 +; GFX9-NEXT: v_writelane_b32 v18, s55, 15 ; GFX9-NEXT: s_cbranch_scc0 .LBB109_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_lshr_b32 s92, s5, 24 @@ -89252,425 +89885,425 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; GFX9-NEXT: s_lshr_b32 s93, s5, 8 ; GFX9-NEXT: s_lshr_b32 s94, s4, 16 ; GFX9-NEXT: s_lshr_b32 s95, s4, 8 -; GFX9-NEXT: s_lshr_b32 s30, s29, 24 -; GFX9-NEXT: s_lshr_b32 s90, s29, 16 -; GFX9-NEXT: s_lshr_b32 s75, s29, 8 -; GFX9-NEXT: s_lshr_b32 s31, s28, 16 -; GFX9-NEXT: s_lshr_b32 s74, s28, 8 -; GFX9-NEXT: s_lshr_b32 s34, s27, 24 -; GFX9-NEXT: s_lshr_b32 s89, s27, 16 -; GFX9-NEXT: s_lshr_b32 s73, s27, 8 -; GFX9-NEXT: s_lshr_b32 s35, s26, 16 -; GFX9-NEXT: s_lshr_b32 s72, s26, 8 -; GFX9-NEXT: s_lshr_b32 s36, s25, 24 -; GFX9-NEXT: s_lshr_b32 s88, s25, 16 -; GFX9-NEXT: s_lshr_b32 s63, s25, 8 -; GFX9-NEXT: s_lshr_b32 s37, s24, 16 -; GFX9-NEXT: s_lshr_b32 s62, s24, 8 -; GFX9-NEXT: s_lshr_b32 s38, s23, 24 -; GFX9-NEXT: s_lshr_b32 s79, s23, 16 -; GFX9-NEXT: s_lshr_b32 s61, s23, 8 -; GFX9-NEXT: s_lshr_b32 s39, s22, 16 -; GFX9-NEXT: s_lshr_b32 s60, s22, 8 -; GFX9-NEXT: s_lshr_b32 s48, s21, 24 -; GFX9-NEXT: s_lshr_b32 s78, s21, 16 -; GFX9-NEXT: s_lshr_b32 s59, s21, 8 -; GFX9-NEXT: s_lshr_b32 s49, s20, 16 -; GFX9-NEXT: s_lshr_b32 s58, s20, 8 -; GFX9-NEXT: s_lshr_b32 s50, s19, 24 -; GFX9-NEXT: s_lshr_b32 s77, s19, 16 -; GFX9-NEXT: s_lshr_b32 s57, s19, 8 -; GFX9-NEXT: s_lshr_b32 s51, s18, 16 -; GFX9-NEXT: s_lshr_b32 s56, s18, 8 -; GFX9-NEXT: s_lshr_b32 s52, s17, 24 -; GFX9-NEXT: s_lshr_b32 s76, s17, 16 -; GFX9-NEXT: s_lshr_b32 s53, s17, 8 -; GFX9-NEXT: s_lshr_b32 s54, s16, 16 -; GFX9-NEXT: s_lshr_b32 s55, s16, 8 -; GFX9-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 -; GFX9-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 -; GFX9-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 -; GFX9-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 -; GFX9-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 -; GFX9-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 -; GFX9-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; GFX9-NEXT: s_lshr_b32 s30, s7, 24 +; GFX9-NEXT: s_lshr_b32 s90, s7, 16 +; GFX9-NEXT: s_lshr_b32 s75, s7, 8 +; GFX9-NEXT: s_lshr_b32 s31, s6, 16 +; GFX9-NEXT: s_lshr_b32 s74, s6, 8 +; GFX9-NEXT: s_lshr_b32 s34, s9, 24 +; GFX9-NEXT: s_lshr_b32 s89, s9, 16 +; GFX9-NEXT: s_lshr_b32 s73, s9, 8 +; GFX9-NEXT: s_lshr_b32 s35, s8, 16 +; GFX9-NEXT: s_lshr_b32 s72, s8, 8 +; GFX9-NEXT: s_lshr_b32 s36, s11, 24 +; GFX9-NEXT: s_lshr_b32 s88, s11, 16 +; GFX9-NEXT: s_lshr_b32 s63, s11, 8 +; GFX9-NEXT: s_lshr_b32 s37, s10, 16 +; GFX9-NEXT: s_lshr_b32 s62, s10, 8 +; GFX9-NEXT: s_lshr_b32 s38, s13, 24 +; GFX9-NEXT: s_lshr_b32 s79, s13, 16 +; GFX9-NEXT: s_lshr_b32 s61, s13, 8 +; GFX9-NEXT: s_lshr_b32 s39, s12, 16 +; GFX9-NEXT: s_lshr_b32 s60, s12, 8 +; GFX9-NEXT: s_lshr_b32 s48, s15, 24 +; GFX9-NEXT: s_lshr_b32 s78, s15, 16 +; GFX9-NEXT: s_lshr_b32 s59, s15, 8 +; GFX9-NEXT: s_lshr_b32 s49, s14, 16 +; GFX9-NEXT: s_lshr_b32 s58, s14, 8 +; GFX9-NEXT: s_lshr_b32 s50, s17, 24 +; GFX9-NEXT: s_lshr_b32 s77, s17, 16 +; GFX9-NEXT: s_lshr_b32 s57, s17, 8 +; GFX9-NEXT: s_lshr_b32 s51, s16, 16 +; GFX9-NEXT: s_lshr_b32 s56, s16, 8 +; GFX9-NEXT: s_lshr_b32 s52, s19, 24 +; GFX9-NEXT: s_lshr_b32 s76, s19, 16 +; GFX9-NEXT: s_lshr_b32 s53, s19, 8 +; GFX9-NEXT: s_lshr_b32 s54, s18, 16 +; GFX9-NEXT: s_lshr_b32 s55, s18, 8 +; GFX9-NEXT: s_lshr_b64 s[20:21], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[22:23], s[6:7], 24 +; GFX9-NEXT: s_lshr_b64 s[24:25], s[8:9], 24 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[10:11], 24 +; GFX9-NEXT: s_lshr_b64 s[28:29], s[12:13], 24 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[14:15], 24 +; GFX9-NEXT: s_lshr_b64 s[42:43], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[44:45], s[18:19], 24 ; GFX9-NEXT: s_cbranch_execnz .LBB109_3 ; GFX9-NEXT: .LBB109_2: ; %cmp.true -; GFX9-NEXT: s_and_b32 s6, s17, 0xffff0000 +; GFX9-NEXT: s_and_b32 s20, s19, 0xffff0000 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40c00000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_add_f32_e32 v2, s20, v1 +; GFX9-NEXT: v_readfirstlane_b32 s20, v2 +; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010 +; GFX9-NEXT: s_add_i32 s21, s21, s20 +; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff +; GFX9-NEXT: s_or_b32 s23, s20, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s76, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s17, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s20, s23, s22 +; GFX9-NEXT: s_lshl_b32 s19, s19, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s19, v1 +; GFX9-NEXT: v_readfirstlane_b32 s19, v2 +; GFX9-NEXT: s_lshr_b32 s76, s20, 16 +; GFX9-NEXT: s_bfe_u32 s20, s19, 0x10010 +; GFX9-NEXT: s_add_i32 s20, s20, s19 +; GFX9-NEXT: s_add_i32 s22, s20, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s19, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s17, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s16, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s19, s19, s22 +; GFX9-NEXT: s_and_b32 s20, s18, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s20, v1 +; GFX9-NEXT: v_readfirstlane_b32 s20, v2 +; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010 +; GFX9-NEXT: s_add_i32 s21, s21, s20 +; GFX9-NEXT: s_lshr_b32 s19, s19, 16 +; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff +; GFX9-NEXT: s_or_b32 s23, s20, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s8, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s16, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s20, s23, s22 +; GFX9-NEXT: s_lshl_b32 s18, s18, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s18, v1 +; GFX9-NEXT: v_readfirstlane_b32 s18, v2 +; GFX9-NEXT: s_lshr_b32 s22, s20, 16 +; GFX9-NEXT: s_bfe_u32 s20, s18, 0x10010 +; GFX9-NEXT: s_add_i32 s20, s20, s18 +; GFX9-NEXT: s_add_i32 s23, s20, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s18, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s10, s9 -; GFX9-NEXT: s_lshr_b32 s16, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s19, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s46, s16, s8 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s18, s18, s23 +; GFX9-NEXT: s_and_b32 s20, s17, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s20, v1 +; GFX9-NEXT: v_readfirstlane_b32 s20, v2 +; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010 +; GFX9-NEXT: s_lshr_b32 s18, s18, 16 +; GFX9-NEXT: s_add_i32 s21, s21, s20 +; GFX9-NEXT: s_pack_ll_b32_b16 s46, s18, s22 +; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff +; GFX9-NEXT: s_or_b32 s23, s20, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s77, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s19, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s20, s23, s22 +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s17, v1 +; GFX9-NEXT: v_readfirstlane_b32 s17, v2 +; GFX9-NEXT: s_lshr_b32 s77, s20, 16 +; GFX9-NEXT: s_bfe_u32 s20, s17, 0x10010 +; GFX9-NEXT: s_add_i32 s20, s20, s17 +; GFX9-NEXT: s_add_i32 s22, s20, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s17, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s19, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s18, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s17, s17, s22 +; GFX9-NEXT: s_and_b32 s20, s16, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s20, v1 +; GFX9-NEXT: v_readfirstlane_b32 s20, v2 +; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010 +; GFX9-NEXT: s_add_i32 s21, s21, s20 +; GFX9-NEXT: s_lshr_b32 s17, s17, 16 +; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff +; GFX9-NEXT: s_or_b32 s23, s20, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s8, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s18, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s20, s23, s22 +; GFX9-NEXT: s_lshl_b32 s16, s16, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s16, v1 +; GFX9-NEXT: v_readfirstlane_b32 s16, v2 +; GFX9-NEXT: s_lshr_b32 s22, s20, 16 +; GFX9-NEXT: s_bfe_u32 s20, s16, 0x10010 +; GFX9-NEXT: s_add_i32 s20, s20, s16 +; GFX9-NEXT: s_add_i32 s23, s20, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s16, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s10, s9 -; GFX9-NEXT: s_lshr_b32 s18, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s21, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s56, s18, s8 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s16, s16, s23 +; GFX9-NEXT: s_and_b32 s20, s15, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s20, v1 +; GFX9-NEXT: v_readfirstlane_b32 s20, v2 +; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010 +; GFX9-NEXT: s_lshr_b32 s16, s16, 16 +; GFX9-NEXT: s_add_i32 s21, s21, s20 +; GFX9-NEXT: s_pack_ll_b32_b16 s56, s16, s22 +; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff +; GFX9-NEXT: s_or_b32 s23, s20, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s78, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s21, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s20, s23, s22 +; GFX9-NEXT: s_lshl_b32 s15, s15, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s15, v1 +; GFX9-NEXT: v_readfirstlane_b32 s15, v2 +; GFX9-NEXT: s_lshr_b32 s78, s20, 16 +; GFX9-NEXT: s_bfe_u32 s20, s15, 0x10010 +; GFX9-NEXT: s_add_i32 s20, s20, s15 +; GFX9-NEXT: s_add_i32 s22, s20, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s15, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s21, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s20, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s15, s15, s22 +; GFX9-NEXT: s_and_b32 s20, s14, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s20, v1 +; GFX9-NEXT: v_readfirstlane_b32 s20, v2 +; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010 +; GFX9-NEXT: s_add_i32 s21, s21, s20 +; GFX9-NEXT: s_lshr_b32 s15, s15, 16 +; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff +; GFX9-NEXT: s_or_b32 s23, s20, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s8, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s20, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s20, s23, s22 +; GFX9-NEXT: s_lshl_b32 s14, s14, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s14, v1 +; GFX9-NEXT: v_readfirstlane_b32 s14, v2 +; GFX9-NEXT: s_lshr_b32 s22, s20, 16 +; GFX9-NEXT: s_bfe_u32 s20, s14, 0x10010 +; GFX9-NEXT: s_add_i32 s20, s20, s14 +; GFX9-NEXT: s_add_i32 s23, s20, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s14, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s10, s9 -; GFX9-NEXT: s_lshr_b32 s20, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s23, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s58, s20, s8 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s14, s14, s23 +; GFX9-NEXT: s_and_b32 s20, s13, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s20, v1 +; GFX9-NEXT: v_readfirstlane_b32 s20, v2 +; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010 +; GFX9-NEXT: s_lshr_b32 s14, s14, 16 +; GFX9-NEXT: s_add_i32 s21, s21, s20 +; GFX9-NEXT: s_pack_ll_b32_b16 s58, s14, s22 +; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff +; GFX9-NEXT: s_or_b32 s23, s20, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s79, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s23, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s20, s23, s22 +; GFX9-NEXT: s_lshl_b32 s13, s13, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s13, v1 +; GFX9-NEXT: v_readfirstlane_b32 s13, v2 +; GFX9-NEXT: s_lshr_b32 s79, s20, 16 +; GFX9-NEXT: s_bfe_u32 s20, s13, 0x10010 +; GFX9-NEXT: s_add_i32 s20, s20, s13 +; GFX9-NEXT: s_add_i32 s22, s20, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s13, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s23, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s22, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s13, s13, s22 +; GFX9-NEXT: s_and_b32 s20, s12, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s20, v1 +; GFX9-NEXT: v_readfirstlane_b32 s20, v2 +; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010 +; GFX9-NEXT: s_add_i32 s21, s21, s20 +; GFX9-NEXT: s_lshr_b32 s13, s13, 16 +; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff +; GFX9-NEXT: s_or_b32 s23, s20, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s8, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s22, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s20, s23, s22 +; GFX9-NEXT: s_lshl_b32 s12, s12, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s12, v1 +; GFX9-NEXT: v_readfirstlane_b32 s12, v2 +; GFX9-NEXT: s_lshr_b32 s22, s20, 16 +; GFX9-NEXT: s_bfe_u32 s20, s12, 0x10010 +; GFX9-NEXT: s_add_i32 s20, s20, s12 +; GFX9-NEXT: s_add_i32 s23, s20, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s12, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s10, s9 -; GFX9-NEXT: s_lshr_b32 s22, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s25, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s60, s22, s8 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s12, s12, s23 +; GFX9-NEXT: s_and_b32 s20, s11, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s20, v1 +; GFX9-NEXT: v_readfirstlane_b32 s20, v2 +; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010 +; GFX9-NEXT: s_lshr_b32 s12, s12, 16 +; GFX9-NEXT: s_add_i32 s21, s21, s20 +; GFX9-NEXT: s_pack_ll_b32_b16 s60, s12, s22 +; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff +; GFX9-NEXT: s_or_b32 s23, s20, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s88, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s25, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s20, s23, s22 +; GFX9-NEXT: s_lshl_b32 s11, s11, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s11, v1 +; GFX9-NEXT: v_readfirstlane_b32 s11, v2 +; GFX9-NEXT: s_lshr_b32 s88, s20, 16 +; GFX9-NEXT: s_bfe_u32 s20, s11, 0x10010 +; GFX9-NEXT: s_add_i32 s20, s20, s11 +; GFX9-NEXT: s_add_i32 s22, s20, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s11, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s25, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s24, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s11, s11, s22 +; GFX9-NEXT: s_and_b32 s20, s10, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s20, v1 +; GFX9-NEXT: v_readfirstlane_b32 s20, v2 +; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010 +; GFX9-NEXT: s_add_i32 s21, s21, s20 +; GFX9-NEXT: s_lshr_b32 s11, s11, 16 +; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff +; GFX9-NEXT: s_or_b32 s23, s20, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s8, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s24, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s20, s23, s22 +; GFX9-NEXT: s_lshl_b32 s10, s10, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s10, v1 +; GFX9-NEXT: v_readfirstlane_b32 s10, v2 +; GFX9-NEXT: s_lshr_b32 s22, s20, 16 +; GFX9-NEXT: s_bfe_u32 s20, s10, 0x10010 +; GFX9-NEXT: s_add_i32 s20, s20, s10 +; GFX9-NEXT: s_add_i32 s23, s20, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s10, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s10, s9 -; GFX9-NEXT: s_lshr_b32 s24, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s27, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s62, s24, s8 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s10, s10, s23 +; GFX9-NEXT: s_and_b32 s20, s9, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s20, v1 +; GFX9-NEXT: v_readfirstlane_b32 s20, v2 +; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010 +; GFX9-NEXT: s_lshr_b32 s10, s10, 16 +; GFX9-NEXT: s_add_i32 s21, s21, s20 +; GFX9-NEXT: s_pack_ll_b32_b16 s62, s10, s22 +; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff +; GFX9-NEXT: s_or_b32 s23, s20, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s89, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s27, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s20, s23, s22 +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s9, v1 +; GFX9-NEXT: v_readfirstlane_b32 s9, v2 +; GFX9-NEXT: s_lshr_b32 s89, s20, 16 +; GFX9-NEXT: s_bfe_u32 s20, s9, 0x10010 +; GFX9-NEXT: s_add_i32 s20, s20, s9 +; GFX9-NEXT: s_add_i32 s22, s20, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s9, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s27, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s26, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s9, s9, s22 +; GFX9-NEXT: s_and_b32 s20, s8, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s20, v1 +; GFX9-NEXT: v_readfirstlane_b32 s20, v2 +; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010 +; GFX9-NEXT: s_add_i32 s21, s21, s20 +; GFX9-NEXT: s_lshr_b32 s9, s9, 16 +; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff +; GFX9-NEXT: s_or_b32 s23, s20, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s8, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s26, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s20, s23, s22 +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s8, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: s_lshr_b32 s22, s20, 16 +; GFX9-NEXT: s_bfe_u32 s20, s8, 0x10010 +; GFX9-NEXT: s_add_i32 s20, s20, s8 +; GFX9-NEXT: s_add_i32 s23, s20, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s8, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s10, s9 -; GFX9-NEXT: s_lshr_b32 s26, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s29, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s72, s26, s8 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s8, s8, s23 +; GFX9-NEXT: s_and_b32 s20, s7, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s20, v1 +; GFX9-NEXT: v_readfirstlane_b32 s20, v2 +; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010 +; GFX9-NEXT: s_lshr_b32 s8, s8, 16 +; GFX9-NEXT: s_add_i32 s21, s21, s20 +; GFX9-NEXT: s_pack_ll_b32_b16 s72, s8, s22 +; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff +; GFX9-NEXT: s_or_b32 s23, s20, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s90, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s29, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s20, s23, s22 +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s7, v2 +; GFX9-NEXT: s_lshr_b32 s90, s20, 16 +; GFX9-NEXT: s_bfe_u32 s20, s7, 0x10010 +; GFX9-NEXT: s_add_i32 s20, s20, s7 +; GFX9-NEXT: s_add_i32 s22, s20, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s7, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s29, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s28, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s7, s7, s22 +; GFX9-NEXT: s_and_b32 s20, s6, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s20, v1 +; GFX9-NEXT: v_readfirstlane_b32 s20, v2 +; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010 +; GFX9-NEXT: s_add_i32 s21, s21, s20 +; GFX9-NEXT: s_lshr_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff +; GFX9-NEXT: s_or_b32 s23, s20, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s8, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s28, 16 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s20, s23, s22 +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 ; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 ; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: s_lshr_b32 s22, s20, 16 +; GFX9-NEXT: s_bfe_u32 s20, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s20, s20, s6 +; GFX9-NEXT: s_add_i32 s23, s20, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s6, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s10, s9 -; GFX9-NEXT: s_lshr_b32 s28, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s5, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s74, s28, s8 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s6, s23 +; GFX9-NEXT: s_and_b32 s20, s5, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s20, v1 +; GFX9-NEXT: v_readfirstlane_b32 s20, v2 +; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010 +; GFX9-NEXT: s_lshr_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s21, s21, s20 +; GFX9-NEXT: s_pack_ll_b32_b16 s74, s6, s22 +; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff +; GFX9-NEXT: s_or_b32 s23, s20, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s20, s23, s22 ; GFX9-NEXT: s_lshl_b32 s5, s5, 16 ; GFX9-NEXT: v_add_f32_e32 v2, s5, v1 ; GFX9-NEXT: v_readfirstlane_b32 s5, v2 -; GFX9-NEXT: s_lshr_b32 s91, s6, 16 -; GFX9-NEXT: s_bfe_u32 s6, s5, 0x10010 -; GFX9-NEXT: s_add_i32 s6, s6, s5 -; GFX9-NEXT: s_add_i32 s8, s6, 0x7fff +; GFX9-NEXT: s_lshr_b32 s91, s20, 16 +; GFX9-NEXT: s_bfe_u32 s20, s5, 0x10010 +; GFX9-NEXT: s_add_i32 s20, s20, s5 +; GFX9-NEXT: s_add_i32 s22, s20, 0x7fff ; GFX9-NEXT: s_bitset1_b32 s5, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s5, s5, s8 -; GFX9-NEXT: s_and_b32 s6, s4, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s5, s5, s22 +; GFX9-NEXT: s_and_b32 s20, s4, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s20, v1 +; GFX9-NEXT: v_readfirstlane_b32 s20, v2 +; GFX9-NEXT: s_bfe_u32 s21, s20, 0x10010 +; GFX9-NEXT: s_add_i32 s21, s21, s20 ; GFX9-NEXT: s_lshr_b32 s5, s5, 16 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_add_i32 s22, s21, 0x7fff +; GFX9-NEXT: s_or_b32 s23, s20, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s20, s23, s22 ; GFX9-NEXT: s_lshl_b32 s4, s4, 16 ; GFX9-NEXT: v_add_f32_e32 v1, s4, v1 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: s_lshr_b32 s8, s6, 16 -; GFX9-NEXT: s_bfe_u32 s6, s4, 0x10010 -; GFX9-NEXT: s_add_i32 s6, s6, s4 -; GFX9-NEXT: s_add_i32 s9, s6, 0x7fff +; GFX9-NEXT: s_lshr_b32 s22, s20, 16 +; GFX9-NEXT: s_bfe_u32 s20, s4, 0x10010 +; GFX9-NEXT: s_add_i32 s20, s20, s4 +; GFX9-NEXT: s_add_i32 s23, s20, 0x7fff ; GFX9-NEXT: s_bitset1_b32 s4, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s4, s4, s9 +; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec +; GFX9-NEXT: s_cselect_b32 s4, s4, s23 ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s47, s17, s76 -; GFX9-NEXT: s_pack_ll_b32_b16 s57, s19, s77 -; GFX9-NEXT: s_pack_ll_b32_b16 s59, s21, s78 -; GFX9-NEXT: s_pack_ll_b32_b16 s61, s23, s79 -; GFX9-NEXT: s_pack_ll_b32_b16 s63, s25, s88 -; GFX9-NEXT: s_pack_ll_b32_b16 s73, s27, s89 -; GFX9-NEXT: s_pack_ll_b32_b16 s75, s29, s90 +; GFX9-NEXT: s_pack_ll_b32_b16 s47, s19, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s57, s17, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s59, s15, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s61, s13, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s63, s11, s88 +; GFX9-NEXT: s_pack_ll_b32_b16 s73, s9, s89 +; GFX9-NEXT: s_pack_ll_b32_b16 s75, s7, s90 ; GFX9-NEXT: s_pack_ll_b32_b16 s31, s5, s91 -; GFX9-NEXT: s_pack_ll_b32_b16 s30, s4, s8 -; GFX9-NEXT: s_lshr_b64 s[6:7], s[30:31], 24 -; GFX9-NEXT: s_lshr_b64 s[8:9], s[74:75], 24 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[72:73], 24 -; GFX9-NEXT: s_lshr_b64 s[12:13], s[62:63], 24 -; GFX9-NEXT: s_lshr_b64 s[14:15], s[60:61], 24 +; GFX9-NEXT: s_pack_ll_b32_b16 s30, s4, s22 +; GFX9-NEXT: s_lshr_b64 s[20:21], s[30:31], 24 +; GFX9-NEXT: s_lshr_b64 s[22:23], s[74:75], 24 +; GFX9-NEXT: s_lshr_b64 s[24:25], s[72:73], 24 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[62:63], 24 +; GFX9-NEXT: s_lshr_b64 s[28:29], s[60:61], 24 ; GFX9-NEXT: s_lshr_b64 s[40:41], s[58:59], 24 ; GFX9-NEXT: s_lshr_b64 s[42:43], s[56:57], 24 ; GFX9-NEXT: s_lshr_b64 s[44:45], s[46:47], 24 @@ -89707,165 +90340,165 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; GFX9-NEXT: s_lshr_b32 s54, s46, 16 ; GFX9-NEXT: s_lshr_b32 s55, s46, 8 ; GFX9-NEXT: .LBB109_3: ; %end -; GFX9-NEXT: s_and_b32 s7, s16, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s55, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s54, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s44, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s17, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s53, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s76, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s52, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s18, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s21, s55, 8 +; GFX9-NEXT: s_or_b32 s18, s18, s21 +; GFX9-NEXT: s_and_b32 s21, s54, 0xff +; GFX9-NEXT: s_lshl_b32 s23, s44, 8 +; GFX9-NEXT: s_or_b32 s21, s21, s23 +; GFX9-NEXT: s_and_b32 s18, s18, 0xffff +; GFX9-NEXT: s_lshl_b32 s21, s21, 16 +; GFX9-NEXT: s_or_b32 s18, s18, s21 +; GFX9-NEXT: v_mov_b32_e32 v1, s18 +; GFX9-NEXT: s_and_b32 s18, s19, 0xff +; GFX9-NEXT: s_lshl_b32 s19, s53, 8 +; GFX9-NEXT: s_or_b32 s18, s18, s19 +; GFX9-NEXT: s_and_b32 s19, s76, 0xff +; GFX9-NEXT: s_lshl_b32 s21, s52, 8 +; GFX9-NEXT: s_or_b32 s19, s19, s21 +; GFX9-NEXT: s_and_b32 s18, s18, 0xffff +; GFX9-NEXT: s_lshl_b32 s19, s19, 16 +; GFX9-NEXT: s_or_b32 s18, s18, s19 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s18, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s56, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s51, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s42, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s56, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s18 +; GFX9-NEXT: s_and_b32 s18, s51, 0xff +; GFX9-NEXT: s_lshl_b32 s19, s42, 8 +; GFX9-NEXT: s_or_b32 s18, s18, s19 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s18, s18, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s18 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s19, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s57, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s77, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s50, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: s_and_b32 s16, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s57, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: s_and_b32 s17, s77, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s50, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s20, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s58, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s49, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s40, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: s_and_b32 s14, s14, 0xff +; GFX9-NEXT: s_lshl_b32 s16, s58, 8 +; GFX9-NEXT: s_or_b32 s14, s14, s16 +; GFX9-NEXT: s_and_b32 s16, s49, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s40, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: s_and_b32 s14, s14, 0xffff +; GFX9-NEXT: s_lshl_b32 s16, s16, 16 +; GFX9-NEXT: s_or_b32 s14, s14, s16 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s21, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s59, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s78, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s48, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s14 +; GFX9-NEXT: s_and_b32 s14, s15, 0xff +; GFX9-NEXT: s_lshl_b32 s15, s59, 8 +; GFX9-NEXT: s_or_b32 s14, s14, s15 +; GFX9-NEXT: s_and_b32 s15, s78, 0xff +; GFX9-NEXT: s_lshl_b32 s16, s48, 8 +; GFX9-NEXT: s_or_b32 s15, s15, s16 +; GFX9-NEXT: s_and_b32 s14, s14, 0xffff +; GFX9-NEXT: s_lshl_b32 s15, s15, 16 +; GFX9-NEXT: s_or_b32 s14, s14, s15 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s22, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s60, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s39, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s14, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s14 +; GFX9-NEXT: s_and_b32 s12, s12, 0xff +; GFX9-NEXT: s_lshl_b32 s14, s60, 8 +; GFX9-NEXT: s_or_b32 s12, s12, s14 +; GFX9-NEXT: s_and_b32 s14, s39, 0xff +; GFX9-NEXT: s_lshl_b32 s15, s28, 8 +; GFX9-NEXT: s_or_b32 s14, s14, s15 +; GFX9-NEXT: s_and_b32 s12, s12, 0xffff +; GFX9-NEXT: s_lshl_b32 s14, s14, 16 +; GFX9-NEXT: s_or_b32 s12, s12, s14 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s23, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s61, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s79, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s38, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: s_and_b32 s12, s13, 0xff +; GFX9-NEXT: s_lshl_b32 s13, s61, 8 +; GFX9-NEXT: s_or_b32 s12, s12, s13 +; GFX9-NEXT: s_and_b32 s13, s79, 0xff +; GFX9-NEXT: s_lshl_b32 s14, s38, 8 +; GFX9-NEXT: s_or_b32 s13, s13, s14 +; GFX9-NEXT: s_and_b32 s12, s12, 0xffff +; GFX9-NEXT: s_lshl_b32 s13, s13, 16 +; GFX9-NEXT: s_or_b32 s12, s12, s13 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s24, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s62, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s37, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s12, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: s_and_b32 s10, s10, 0xff +; GFX9-NEXT: s_lshl_b32 s12, s62, 8 +; GFX9-NEXT: s_or_b32 s10, s10, s12 +; GFX9-NEXT: s_and_b32 s12, s37, 0xff +; GFX9-NEXT: s_lshl_b32 s13, s26, 8 +; GFX9-NEXT: s_or_b32 s12, s12, s13 +; GFX9-NEXT: s_and_b32 s10, s10, 0xffff +; GFX9-NEXT: s_lshl_b32 s12, s12, 16 +; GFX9-NEXT: s_or_b32 s10, s10, s12 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s25, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s63, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s88, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s36, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NEXT: s_and_b32 s10, s11, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s63, 8 +; GFX9-NEXT: s_or_b32 s10, s10, s11 +; GFX9-NEXT: s_and_b32 s11, s88, 0xff +; GFX9-NEXT: s_lshl_b32 s12, s36, 8 +; GFX9-NEXT: s_or_b32 s11, s11, s12 +; GFX9-NEXT: s_and_b32 s10, s10, 0xffff +; GFX9-NEXT: s_lshl_b32 s11, s11, 16 +; GFX9-NEXT: s_or_b32 s10, s10, s11 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s26, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s72, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s35, 0xff -; GFX9-NEXT: s_lshl_b32 s10, s10, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s10 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NEXT: s_and_b32 s8, s8, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s72, 8 +; GFX9-NEXT: s_or_b32 s8, s8, s10 +; GFX9-NEXT: s_and_b32 s10, s35, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s24, 8 +; GFX9-NEXT: s_or_b32 s10, s10, s11 +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: s_lshl_b32 s10, s10, 16 +; GFX9-NEXT: s_or_b32 s8, s8, s10 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s27, 0xff +; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: s_and_b32 s8, s9, 0xff ; GFX9-NEXT: s_lshl_b32 s9, s73, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_or_b32 s8, s8, s9 ; GFX9-NEXT: s_and_b32 s9, s89, 0xff ; GFX9-NEXT: s_lshl_b32 s10, s34, 8 ; GFX9-NEXT: s_or_b32 s9, s9, s10 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff ; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_or_b32 s8, s8, s9 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s28, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s74, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s31, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s8, 8 -; GFX9-NEXT: s_or_b32 s8, s9, s8 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s8, s8, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s29, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s75, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: s_and_b32 s8, s90, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s30, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s74, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s8 +; GFX9-NEXT: s_and_b32 s8, s31, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s22, 8 ; GFX9-NEXT: s_or_b32 s8, s8, s9 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s8 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: s_and_b32 s6, s7, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s75, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s90, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s30, 8 ; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: s_and_b32 s4, s4, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s95, 8 -; GFX9-NEXT: s_or_b32 s4, s4, s7 -; GFX9-NEXT: s_and_b32 s7, s94, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s6, 8 -; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_lshl_b32 s6, s95, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s94, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s20, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 ; GFX9-NEXT: s_or_b32 s4, s4, s6 @@ -89883,24 +90516,24 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: v_readlane_b32 s55, v4, 15 -; GFX9-NEXT: v_readlane_b32 s54, v4, 14 -; GFX9-NEXT: v_readlane_b32 s53, v4, 13 -; GFX9-NEXT: v_readlane_b32 s52, v4, 12 -; GFX9-NEXT: v_readlane_b32 s51, v4, 11 -; GFX9-NEXT: v_readlane_b32 s50, v4, 10 -; GFX9-NEXT: v_readlane_b32 s49, v4, 9 -; GFX9-NEXT: v_readlane_b32 s48, v4, 8 -; GFX9-NEXT: v_readlane_b32 s39, v4, 7 -; GFX9-NEXT: v_readlane_b32 s38, v4, 6 -; GFX9-NEXT: v_readlane_b32 s37, v4, 5 -; GFX9-NEXT: v_readlane_b32 s36, v4, 4 -; GFX9-NEXT: v_readlane_b32 s35, v4, 3 -; GFX9-NEXT: v_readlane_b32 s34, v4, 2 -; GFX9-NEXT: v_readlane_b32 s31, v4, 1 -; GFX9-NEXT: v_readlane_b32 s30, v4, 0 +; GFX9-NEXT: v_readlane_b32 s55, v18, 15 +; GFX9-NEXT: v_readlane_b32 s54, v18, 14 +; GFX9-NEXT: v_readlane_b32 s53, v18, 13 +; GFX9-NEXT: v_readlane_b32 s52, v18, 12 +; GFX9-NEXT: v_readlane_b32 s51, v18, 11 +; GFX9-NEXT: v_readlane_b32 s50, v18, 10 +; GFX9-NEXT: v_readlane_b32 s49, v18, 9 +; GFX9-NEXT: v_readlane_b32 s48, v18, 8 +; GFX9-NEXT: v_readlane_b32 s39, v18, 7 +; GFX9-NEXT: v_readlane_b32 s38, v18, 6 +; GFX9-NEXT: v_readlane_b32 s37, v18, 5 +; GFX9-NEXT: v_readlane_b32 s36, v18, 4 +; GFX9-NEXT: v_readlane_b32 s35, v18, 3 +; GFX9-NEXT: v_readlane_b32 s34, v18, 2 +; GFX9-NEXT: v_readlane_b32 s31, v18, 1 +; GFX9-NEXT: v_readlane_b32 s30, v18, 0 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -89925,31 +90558,31 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; GFX9-NEXT: ; implicit-def: $sgpr48 ; GFX9-NEXT: ; implicit-def: $sgpr60 ; GFX9-NEXT: ; implicit-def: $sgpr39 -; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr28 ; GFX9-NEXT: ; implicit-def: $sgpr61 ; GFX9-NEXT: ; implicit-def: $sgpr79 ; GFX9-NEXT: ; implicit-def: $sgpr38 ; GFX9-NEXT: ; implicit-def: $sgpr62 ; GFX9-NEXT: ; implicit-def: $sgpr37 -; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; implicit-def: $sgpr63 ; GFX9-NEXT: ; implicit-def: $sgpr88 ; GFX9-NEXT: ; implicit-def: $sgpr36 ; GFX9-NEXT: ; implicit-def: $sgpr72 ; GFX9-NEXT: ; implicit-def: $sgpr35 -; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr24 ; GFX9-NEXT: ; implicit-def: $sgpr73 ; GFX9-NEXT: ; implicit-def: $sgpr89 ; GFX9-NEXT: ; implicit-def: $sgpr34 ; GFX9-NEXT: ; implicit-def: $sgpr74 ; GFX9-NEXT: ; implicit-def: $sgpr31 -; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr22 ; GFX9-NEXT: ; implicit-def: $sgpr75 ; GFX9-NEXT: ; implicit-def: $sgpr90 ; GFX9-NEXT: ; implicit-def: $sgpr30 ; GFX9-NEXT: ; implicit-def: $sgpr95 ; GFX9-NEXT: ; implicit-def: $sgpr94 -; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr20 ; GFX9-NEXT: ; implicit-def: $sgpr93 ; GFX9-NEXT: ; implicit-def: $sgpr91 ; GFX9-NEXT: ; implicit-def: $sgpr92 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll index 7bd2c7a628ebd..fe226fa0bb47f 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll @@ -2759,169 +2759,197 @@ define inreg <36 x i16> @bitcast_v18i32_to_v36i16_scalar(<18 x i32> inreg %a, i3 ; SI-LABEL: bitcast_v18i32_to_v36i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, s16 +; SI-NEXT: v_mov_b32_e32 v7, s17 +; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v9, s19 +; SI-NEXT: v_mov_b32_e32 v10, s20 +; SI-NEXT: v_mov_b32_e32 v11, s21 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v13, s23 +; SI-NEXT: v_mov_b32_e32 v14, s24 +; SI-NEXT: v_mov_b32_e32 v15, s25 +; SI-NEXT: v_mov_b32_e32 v16, s26 +; SI-NEXT: v_mov_b32_e32 v17, s27 +; SI-NEXT: v_mov_b32_e32 v18, s28 +; SI-NEXT: v_mov_b32_e32 v19, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; SI-NEXT: v_readfirstlane_b32 s20, v6 +; SI-NEXT: v_readfirstlane_b32 s21, v7 +; SI-NEXT: v_readfirstlane_b32 s18, v8 +; SI-NEXT: v_readfirstlane_b32 s19, v9 +; SI-NEXT: v_readfirstlane_b32 s16, v10 +; SI-NEXT: v_readfirstlane_b32 s17, v11 +; SI-NEXT: v_readfirstlane_b32 s14, v12 +; SI-NEXT: v_readfirstlane_b32 s15, v13 +; SI-NEXT: v_readfirstlane_b32 s12, v14 +; SI-NEXT: v_readfirstlane_b32 s13, v15 +; SI-NEXT: v_readfirstlane_b32 s10, v16 +; SI-NEXT: v_readfirstlane_b32 s11, v17 +; SI-NEXT: v_readfirstlane_b32 s8, v18 +; SI-NEXT: v_readfirstlane_b32 s9, v19 ; SI-NEXT: v_readfirstlane_b32 s6, v1 ; SI-NEXT: v_readfirstlane_b32 s7, v2 ; SI-NEXT: v_readfirstlane_b32 s4, v3 -; SI-NEXT: s_and_b64 s[8:9], vcc, exec +; SI-NEXT: s_and_b64 s[22:23], vcc, exec ; SI-NEXT: v_readfirstlane_b32 s5, v4 ; SI-NEXT: s_cbranch_scc0 .LBB13_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s60, s5, 16 ; SI-NEXT: s_lshr_b32 s61, s7, 16 -; SI-NEXT: s_lshr_b32 s62, s29, 16 -; SI-NEXT: s_lshr_b32 s63, s27, 16 -; SI-NEXT: s_lshr_b32 s72, s25, 16 -; SI-NEXT: s_lshr_b32 s73, s23, 16 -; SI-NEXT: s_lshr_b32 s74, s21, 16 +; SI-NEXT: s_lshr_b32 s62, s9, 16 +; SI-NEXT: s_lshr_b32 s63, s11, 16 +; SI-NEXT: s_lshr_b32 s72, s13, 16 +; SI-NEXT: s_lshr_b32 s73, s15, 16 +; SI-NEXT: s_lshr_b32 s74, s17, 16 ; SI-NEXT: s_lshr_b32 s75, s19, 16 -; SI-NEXT: s_lshr_b32 s76, s17, 16 -; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[10:11], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[12:13], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[14:15], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[40:41], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[42:43], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[20:21], 16 +; SI-NEXT: s_lshr_b32 s76, s21, 16 +; SI-NEXT: s_lshr_b64 s[22:23], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[16:17], 16 ; SI-NEXT: s_lshr_b64 s[46:47], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[56:57], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[20:21], 16 ; SI-NEXT: s_cbranch_execnz .LBB13_3 ; SI-NEXT: .LBB13_2: ; %cmp.true -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s5, s5, 3 ; SI-NEXT: s_add_i32 s4, s4, 3 -; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[10:11], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[12:13], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[14:15], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[40:41], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[42:43], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[22:23], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[16:17], 16 ; SI-NEXT: s_lshr_b64 s[46:47], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[56:57], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[20:21], 16 ; SI-NEXT: s_lshr_b32 s60, s5, 16 ; SI-NEXT: s_lshr_b32 s61, s7, 16 -; SI-NEXT: s_lshr_b32 s62, s29, 16 -; SI-NEXT: s_lshr_b32 s63, s27, 16 -; SI-NEXT: s_lshr_b32 s72, s25, 16 -; SI-NEXT: s_lshr_b32 s73, s23, 16 -; SI-NEXT: s_lshr_b32 s74, s21, 16 +; SI-NEXT: s_lshr_b32 s62, s9, 16 +; SI-NEXT: s_lshr_b32 s63, s11, 16 +; SI-NEXT: s_lshr_b32 s72, s13, 16 +; SI-NEXT: s_lshr_b32 s73, s15, 16 +; SI-NEXT: s_lshr_b32 s74, s17, 16 ; SI-NEXT: s_lshr_b32 s75, s19, 16 -; SI-NEXT: s_lshr_b32 s76, s17, 16 +; SI-NEXT: s_lshr_b32 s76, s21, 16 ; SI-NEXT: .LBB13_3: ; %end -; SI-NEXT: s_lshl_b32 s9, s56, 16 -; SI-NEXT: s_and_b32 s11, s16, 0xffff -; SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: s_and_b32 s9, s17, 0xffff -; SI-NEXT: s_lshl_b32 s11, s76, 16 -; SI-NEXT: s_or_b32 s9, s9, s11 -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: s_and_b32 s9, s18, 0xffff -; SI-NEXT: s_lshl_b32 s11, s46, 16 +; SI-NEXT: s_lshl_b32 s23, s56, 16 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_or_b32 s20, s20, s23 +; SI-NEXT: v_mov_b32_e32 v1, s20 +; SI-NEXT: s_and_b32 s20, s21, 0xffff +; SI-NEXT: s_lshl_b32 s21, s76, 16 +; SI-NEXT: s_or_b32 s20, s20, s21 +; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s20, s46, 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_or_b32 s18, s18, s20 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: s_and_b32 s9, s19, 0xffff -; SI-NEXT: s_lshl_b32 s11, s75, 16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_and_b32 s18, s19, 0xffff +; SI-NEXT: s_lshl_b32 s19, s75, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: s_and_b32 s9, s20, 0xffff -; SI-NEXT: s_lshl_b32 s11, s44, 16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s18, s44, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_or_b32 s16, s16, s18 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: s_and_b32 s9, s21, 0xffff -; SI-NEXT: s_lshl_b32 s11, s74, 16 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s17, 0xffff +; SI-NEXT: s_lshl_b32 s17, s74, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: s_and_b32 s9, s22, 0xffff -; SI-NEXT: s_lshl_b32 s11, s42, 16 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s16, s42, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_or_b32 s14, s14, s16 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: s_and_b32 s9, s23, 0xffff -; SI-NEXT: s_lshl_b32 s11, s73, 16 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_and_b32 s14, s15, 0xffff +; SI-NEXT: s_lshl_b32 s15, s73, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: s_and_b32 s9, s24, 0xffff -; SI-NEXT: s_lshl_b32 s11, s40, 16 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s14, s40, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_or_b32 s12, s12, s14 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: s_and_b32 s9, s25, 0xffff -; SI-NEXT: s_lshl_b32 s11, s72, 16 +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s12, s13, 0xffff +; SI-NEXT: s_lshl_b32 s13, s72, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_or_b32 s12, s12, s13 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s11, s14, 16 +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s12, s28, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_or_b32 s10, s10, s12 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: s_and_b32 s9, s27, 0xffff +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s10, s11, 0xffff ; SI-NEXT: s_lshl_b32 s11, s63, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 -; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: s_and_b32 s9, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s12, 16 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s10, s26, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_or_b32 s8, s8, s10 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: s_and_b32 s9, s29, 0xffff -; SI-NEXT: s_lshl_b32 s11, s62, 16 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s8, s9, 0xffff +; SI-NEXT: s_lshl_b32 s9, s62, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_lshl_b32 s9, s10, 16 +; SI-NEXT: s_lshl_b32 s8, s24, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 -; SI-NEXT: s_or_b32 s6, s6, s9 +; SI-NEXT: s_or_b32 s6, s6, s8 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -2933,7 +2961,7 @@ define inreg <36 x i16> @bitcast_v18i32_to_v36i16_scalar(<18 x i32> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s6, s8, 16 +; SI-NEXT: s_lshl_b32 s6, s22, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -2960,20 +2988,48 @@ define inreg <36 x i16> @bitcast_v18i32_to_v36i16_scalar(<18 x i32> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr73 ; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr63 -; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr24 ; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr22 ; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: s_branch .LBB13_2 ; ; VI-LABEL: bitcast_v18i32_to_v36i16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v5, s16 +; VI-NEXT: v_mov_b32_e32 v6, s17 +; VI-NEXT: v_mov_b32_e32 v7, s18 +; VI-NEXT: v_mov_b32_e32 v8, s19 +; VI-NEXT: v_mov_b32_e32 v9, s20 +; VI-NEXT: v_mov_b32_e32 v10, s21 +; VI-NEXT: v_mov_b32_e32 v11, s22 +; VI-NEXT: v_mov_b32_e32 v12, s23 +; VI-NEXT: v_mov_b32_e32 v13, s24 +; VI-NEXT: v_mov_b32_e32 v14, s25 +; VI-NEXT: v_mov_b32_e32 v15, s26 +; VI-NEXT: v_mov_b32_e32 v16, s27 +; VI-NEXT: v_mov_b32_e32 v17, s28 +; VI-NEXT: v_mov_b32_e32 v18, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_readfirstlane_b32 s23, v5 +; VI-NEXT: v_readfirstlane_b32 s22, v6 +; VI-NEXT: v_readfirstlane_b32 s21, v7 +; VI-NEXT: v_readfirstlane_b32 s20, v8 +; VI-NEXT: v_readfirstlane_b32 s19, v9 +; VI-NEXT: v_readfirstlane_b32 s18, v10 +; VI-NEXT: v_readfirstlane_b32 s17, v11 +; VI-NEXT: v_readfirstlane_b32 s16, v12 +; VI-NEXT: v_readfirstlane_b32 s15, v13 +; VI-NEXT: v_readfirstlane_b32 s14, v14 +; VI-NEXT: v_readfirstlane_b32 s13, v15 +; VI-NEXT: v_readfirstlane_b32 s12, v16 +; VI-NEXT: v_readfirstlane_b32 s11, v17 +; VI-NEXT: v_readfirstlane_b32 s10, v18 ; VI-NEXT: v_readfirstlane_b32 s9, v0 ; VI-NEXT: v_readfirstlane_b32 s8, v1 ; VI-NEXT: v_readfirstlane_b32 s6, v2 @@ -2981,131 +3037,131 @@ define inreg <36 x i16> @bitcast_v18i32_to_v36i16_scalar(<18 x i32> inreg %a, i3 ; VI-NEXT: v_readfirstlane_b32 s7, v3 ; VI-NEXT: s_cbranch_scc0 .LBB13_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s10, s7, 16 -; VI-NEXT: s_lshr_b32 s11, s6, 16 -; VI-NEXT: s_lshr_b32 s12, s8, 16 -; VI-NEXT: s_lshr_b32 s13, s9, 16 -; VI-NEXT: s_lshr_b32 s14, s29, 16 -; VI-NEXT: s_lshr_b32 s15, s28, 16 -; VI-NEXT: s_lshr_b32 s40, s27, 16 -; VI-NEXT: s_lshr_b32 s41, s26, 16 -; VI-NEXT: s_lshr_b32 s42, s25, 16 -; VI-NEXT: s_lshr_b32 s43, s24, 16 -; VI-NEXT: s_lshr_b32 s44, s23, 16 -; VI-NEXT: s_lshr_b32 s45, s22, 16 -; VI-NEXT: s_lshr_b32 s46, s21, 16 -; VI-NEXT: s_lshr_b32 s47, s20, 16 -; VI-NEXT: s_lshr_b32 s56, s19, 16 -; VI-NEXT: s_lshr_b32 s57, s18, 16 -; VI-NEXT: s_lshr_b32 s58, s17, 16 -; VI-NEXT: s_lshr_b32 s59, s16, 16 +; VI-NEXT: s_lshr_b32 s24, s7, 16 +; VI-NEXT: s_lshr_b32 s25, s6, 16 +; VI-NEXT: s_lshr_b32 s26, s8, 16 +; VI-NEXT: s_lshr_b32 s27, s9, 16 +; VI-NEXT: s_lshr_b32 s28, s10, 16 +; VI-NEXT: s_lshr_b32 s29, s11, 16 +; VI-NEXT: s_lshr_b32 s40, s12, 16 +; VI-NEXT: s_lshr_b32 s41, s13, 16 +; VI-NEXT: s_lshr_b32 s42, s14, 16 +; VI-NEXT: s_lshr_b32 s43, s15, 16 +; VI-NEXT: s_lshr_b32 s44, s16, 16 +; VI-NEXT: s_lshr_b32 s45, s17, 16 +; VI-NEXT: s_lshr_b32 s46, s18, 16 +; VI-NEXT: s_lshr_b32 s47, s19, 16 +; VI-NEXT: s_lshr_b32 s56, s20, 16 +; VI-NEXT: s_lshr_b32 s57, s21, 16 +; VI-NEXT: s_lshr_b32 s58, s22, 16 +; VI-NEXT: s_lshr_b32 s59, s23, 16 ; VI-NEXT: s_cbranch_execnz .LBB13_3 ; VI-NEXT: .LBB13_2: ; %cmp.true ; VI-NEXT: s_add_i32 s7, s7, 3 ; VI-NEXT: s_add_i32 s6, s6, 3 ; VI-NEXT: s_add_i32 s8, s8, 3 ; VI-NEXT: s_add_i32 s9, s9, 3 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_lshr_b32 s10, s7, 16 -; VI-NEXT: s_lshr_b32 s11, s6, 16 -; VI-NEXT: s_lshr_b32 s12, s8, 16 -; VI-NEXT: s_lshr_b32 s13, s9, 16 -; VI-NEXT: s_lshr_b32 s14, s29, 16 -; VI-NEXT: s_lshr_b32 s15, s28, 16 -; VI-NEXT: s_lshr_b32 s40, s27, 16 -; VI-NEXT: s_lshr_b32 s41, s26, 16 -; VI-NEXT: s_lshr_b32 s42, s25, 16 -; VI-NEXT: s_lshr_b32 s43, s24, 16 -; VI-NEXT: s_lshr_b32 s44, s23, 16 -; VI-NEXT: s_lshr_b32 s45, s22, 16 -; VI-NEXT: s_lshr_b32 s46, s21, 16 -; VI-NEXT: s_lshr_b32 s47, s20, 16 -; VI-NEXT: s_lshr_b32 s56, s19, 16 -; VI-NEXT: s_lshr_b32 s57, s18, 16 -; VI-NEXT: s_lshr_b32 s58, s17, 16 -; VI-NEXT: s_lshr_b32 s59, s16, 16 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_lshr_b32 s24, s7, 16 +; VI-NEXT: s_lshr_b32 s25, s6, 16 +; VI-NEXT: s_lshr_b32 s26, s8, 16 +; VI-NEXT: s_lshr_b32 s27, s9, 16 +; VI-NEXT: s_lshr_b32 s28, s10, 16 +; VI-NEXT: s_lshr_b32 s29, s11, 16 +; VI-NEXT: s_lshr_b32 s40, s12, 16 +; VI-NEXT: s_lshr_b32 s41, s13, 16 +; VI-NEXT: s_lshr_b32 s42, s14, 16 +; VI-NEXT: s_lshr_b32 s43, s15, 16 +; VI-NEXT: s_lshr_b32 s44, s16, 16 +; VI-NEXT: s_lshr_b32 s45, s17, 16 +; VI-NEXT: s_lshr_b32 s46, s18, 16 +; VI-NEXT: s_lshr_b32 s47, s19, 16 +; VI-NEXT: s_lshr_b32 s56, s20, 16 +; VI-NEXT: s_lshr_b32 s57, s21, 16 +; VI-NEXT: s_lshr_b32 s58, s22, 16 +; VI-NEXT: s_lshr_b32 s59, s23, 16 ; VI-NEXT: .LBB13_3: ; %end -; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_and_b32 s4, 0xffff, s23 ; VI-NEXT: s_lshl_b32 s5, s59, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s16, s58, 16 -; VI-NEXT: s_or_b32 s5, s5, s16 -; VI-NEXT: s_and_b32 s16, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s17, s57, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_and_b32 s17, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s18, s56, 16 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s18, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s19, s47, 16 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_and_b32 s19, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s20, s46, 16 -; VI-NEXT: s_or_b32 s19, s19, s20 -; VI-NEXT: s_and_b32 s20, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s21, s45, 16 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_and_b32 s21, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s22, s44, 16 +; VI-NEXT: s_and_b32 s5, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s22, s58, 16 +; VI-NEXT: s_or_b32 s5, s5, s22 +; VI-NEXT: s_and_b32 s21, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s22, s57, 16 ; VI-NEXT: s_or_b32 s21, s21, s22 -; VI-NEXT: s_and_b32 s22, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s23, s43, 16 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_and_b32 s23, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s24, s42, 16 -; VI-NEXT: s_or_b32 s23, s23, s24 -; VI-NEXT: s_and_b32 s24, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s25, s41, 16 -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_and_b32 s25, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s26, s40, 16 -; VI-NEXT: s_or_b32 s25, s25, s26 -; VI-NEXT: s_and_b32 s26, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s15, s15, 16 -; VI-NEXT: s_or_b32 s15, s26, s15 -; VI-NEXT: s_and_b32 s26, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_and_b32 s20, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s22, s56, 16 +; VI-NEXT: s_or_b32 s20, s20, s22 +; VI-NEXT: s_and_b32 s19, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s22, s47, 16 +; VI-NEXT: s_or_b32 s19, s19, s22 +; VI-NEXT: s_and_b32 s18, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s22, s46, 16 +; VI-NEXT: s_or_b32 s18, s18, s22 +; VI-NEXT: s_and_b32 s17, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s22, s45, 16 +; VI-NEXT: s_or_b32 s17, s17, s22 +; VI-NEXT: s_and_b32 s16, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s22, s44, 16 +; VI-NEXT: s_or_b32 s16, s16, s22 +; VI-NEXT: s_and_b32 s15, 0xffff, s15 +; VI-NEXT: s_lshl_b32 s22, s43, 16 +; VI-NEXT: s_or_b32 s15, s15, s22 +; VI-NEXT: s_and_b32 s14, 0xffff, s14 +; VI-NEXT: s_lshl_b32 s22, s42, 16 +; VI-NEXT: s_or_b32 s14, s14, s22 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s22, s41, 16 +; VI-NEXT: s_or_b32 s13, s13, s22 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s22, s40, 16 +; VI-NEXT: s_or_b32 s12, s12, s22 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s22, s29, 16 +; VI-NEXT: s_or_b32 s11, s11, s22 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s22, s28, 16 +; VI-NEXT: s_or_b32 s10, s10, s22 ; VI-NEXT: s_and_b32 s9, 0xffff, s9 -; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_lshl_b32 s22, s27, 16 +; VI-NEXT: s_or_b32 s9, s9, s22 ; VI-NEXT: s_and_b32 s8, 0xffff, s8 -; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_lshl_b32 s22, s26, 16 +; VI-NEXT: s_or_b32 s8, s8, s22 ; VI-NEXT: s_and_b32 s6, 0xffff, s6 -; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_lshl_b32 s22, s25, 16 +; VI-NEXT: s_or_b32 s6, s6, s22 ; VI-NEXT: s_and_b32 s7, 0xffff, s7 -; VI-NEXT: s_lshl_b32 s10, s10, 16 -; VI-NEXT: s_or_b32 s14, s26, s14 -; VI-NEXT: s_or_b32 s9, s9, s13 -; VI-NEXT: s_or_b32 s8, s8, s12 -; VI-NEXT: s_or_b32 s6, s6, s11 -; VI-NEXT: s_or_b32 s7, s7, s10 +; VI-NEXT: s_lshl_b32 s22, s24, 16 +; VI-NEXT: s_or_b32 s7, s7, s22 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: v_mov_b32_e32 v3, s17 -; VI-NEXT: v_mov_b32_e32 v4, s18 -; VI-NEXT: v_mov_b32_e32 v5, s19 -; VI-NEXT: v_mov_b32_e32 v6, s20 -; VI-NEXT: v_mov_b32_e32 v7, s21 -; VI-NEXT: v_mov_b32_e32 v8, s22 -; VI-NEXT: v_mov_b32_e32 v9, s23 -; VI-NEXT: v_mov_b32_e32 v10, s24 -; VI-NEXT: v_mov_b32_e32 v11, s25 -; VI-NEXT: v_mov_b32_e32 v12, s15 -; VI-NEXT: v_mov_b32_e32 v13, s14 +; VI-NEXT: v_mov_b32_e32 v2, s21 +; VI-NEXT: v_mov_b32_e32 v3, s20 +; VI-NEXT: v_mov_b32_e32 v4, s19 +; VI-NEXT: v_mov_b32_e32 v5, s18 +; VI-NEXT: v_mov_b32_e32 v6, s17 +; VI-NEXT: v_mov_b32_e32 v7, s16 +; VI-NEXT: v_mov_b32_e32 v8, s15 +; VI-NEXT: v_mov_b32_e32 v9, s14 +; VI-NEXT: v_mov_b32_e32 v10, s13 +; VI-NEXT: v_mov_b32_e32 v11, s12 +; VI-NEXT: v_mov_b32_e32 v12, s11 +; VI-NEXT: v_mov_b32_e32 v13, s10 ; VI-NEXT: v_mov_b32_e32 v14, s9 ; VI-NEXT: v_mov_b32_e32 v15, s8 ; VI-NEXT: v_mov_b32_e32 v16, s6 @@ -3124,55 +3180,73 @@ define inreg <36 x i16> @bitcast_v18i32_to_v36i16_scalar(<18 x i32> inreg %a, i3 ; VI-NEXT: ; implicit-def: $sgpr42 ; VI-NEXT: ; implicit-def: $sgpr41 ; VI-NEXT: ; implicit-def: $sgpr40 -; VI-NEXT: ; implicit-def: $sgpr15 -; VI-NEXT: ; implicit-def: $sgpr14 -; VI-NEXT: ; implicit-def: $sgpr13 -; VI-NEXT: ; implicit-def: $sgpr12 -; VI-NEXT: ; implicit-def: $sgpr11 -; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr25 +; VI-NEXT: ; implicit-def: $sgpr24 ; VI-NEXT: s_branch .LBB13_2 ; ; GFX9-LABEL: bitcast_v18i32_to_v36i16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v5, s16 +; GFX9-NEXT: v_mov_b32_e32 v6, s17 +; GFX9-NEXT: v_mov_b32_e32 v7, s18 +; GFX9-NEXT: v_mov_b32_e32 v8, s19 +; GFX9-NEXT: v_mov_b32_e32 v9, s20 +; GFX9-NEXT: v_mov_b32_e32 v10, s21 +; GFX9-NEXT: v_mov_b32_e32 v11, s22 +; GFX9-NEXT: v_mov_b32_e32 v12, s23 +; GFX9-NEXT: v_mov_b32_e32 v13, s24 +; GFX9-NEXT: v_mov_b32_e32 v14, s25 +; GFX9-NEXT: v_mov_b32_e32 v15, s26 +; GFX9-NEXT: v_mov_b32_e32 v16, s27 +; GFX9-NEXT: v_mov_b32_e32 v17, s28 +; GFX9-NEXT: v_mov_b32_e32 v18, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: v_readfirstlane_b32 s7, v1 -; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s6, v5 +; GFX9-NEXT: v_readfirstlane_b32 s7, v6 +; GFX9-NEXT: v_readfirstlane_b32 s8, v7 +; GFX9-NEXT: v_readfirstlane_b32 s9, v8 +; GFX9-NEXT: v_readfirstlane_b32 s10, v9 +; GFX9-NEXT: v_readfirstlane_b32 s11, v10 +; GFX9-NEXT: v_readfirstlane_b32 s12, v11 +; GFX9-NEXT: v_readfirstlane_b32 s13, v12 +; GFX9-NEXT: v_readfirstlane_b32 s14, v13 +; GFX9-NEXT: v_readfirstlane_b32 s15, v14 +; GFX9-NEXT: v_readfirstlane_b32 s16, v15 +; GFX9-NEXT: v_readfirstlane_b32 s17, v16 +; GFX9-NEXT: v_readfirstlane_b32 s18, v17 +; GFX9-NEXT: v_readfirstlane_b32 s19, v18 +; GFX9-NEXT: v_readfirstlane_b32 s20, v0 +; GFX9-NEXT: v_readfirstlane_b32 s21, v1 +; GFX9-NEXT: v_readfirstlane_b32 s22, v2 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s23, v3 ; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_lshr_b32 s10, s9, 16 -; GFX9-NEXT: s_lshr_b32 s11, s8, 16 -; GFX9-NEXT: s_lshr_b32 s12, s7, 16 -; GFX9-NEXT: s_lshr_b32 s13, s6, 16 -; GFX9-NEXT: s_lshr_b32 s14, s29, 16 -; GFX9-NEXT: s_lshr_b32 s15, s28, 16 -; GFX9-NEXT: s_lshr_b32 s40, s27, 16 -; GFX9-NEXT: s_lshr_b32 s41, s26, 16 -; GFX9-NEXT: s_lshr_b32 s42, s25, 16 -; GFX9-NEXT: s_lshr_b32 s43, s24, 16 -; GFX9-NEXT: s_lshr_b32 s44, s23, 16 -; GFX9-NEXT: s_lshr_b32 s45, s22, 16 -; GFX9-NEXT: s_lshr_b32 s46, s21, 16 -; GFX9-NEXT: s_lshr_b32 s47, s20, 16 -; GFX9-NEXT: s_lshr_b32 s56, s19, 16 -; GFX9-NEXT: s_lshr_b32 s57, s18, 16 -; GFX9-NEXT: s_lshr_b32 s58, s17, 16 -; GFX9-NEXT: s_lshr_b32 s59, s16, 16 +; GFX9-NEXT: s_lshr_b32 s24, s23, 16 +; GFX9-NEXT: s_lshr_b32 s25, s22, 16 +; GFX9-NEXT: s_lshr_b32 s26, s21, 16 +; GFX9-NEXT: s_lshr_b32 s27, s20, 16 +; GFX9-NEXT: s_lshr_b32 s28, s19, 16 +; GFX9-NEXT: s_lshr_b32 s29, s18, 16 +; GFX9-NEXT: s_lshr_b32 s40, s17, 16 +; GFX9-NEXT: s_lshr_b32 s41, s16, 16 +; GFX9-NEXT: s_lshr_b32 s42, s15, 16 +; GFX9-NEXT: s_lshr_b32 s43, s14, 16 +; GFX9-NEXT: s_lshr_b32 s44, s13, 16 +; GFX9-NEXT: s_lshr_b32 s45, s12, 16 +; GFX9-NEXT: s_lshr_b32 s46, s11, 16 +; GFX9-NEXT: s_lshr_b32 s47, s10, 16 +; GFX9-NEXT: s_lshr_b32 s56, s9, 16 +; GFX9-NEXT: s_lshr_b32 s57, s8, 16 +; GFX9-NEXT: s_lshr_b32 s58, s7, 16 +; GFX9-NEXT: s_lshr_b32 s59, s6, 16 ; GFX9-NEXT: s_cbranch_execnz .LBB13_3 ; GFX9-NEXT: .LBB13_2: ; %cmp.true -; GFX9-NEXT: s_add_i32 s9, s9, 3 -; GFX9-NEXT: s_add_i32 s8, s8, 3 -; GFX9-NEXT: s_add_i32 s7, s7, 3 -; GFX9-NEXT: s_add_i32 s6, s6, 3 -; GFX9-NEXT: s_add_i32 s29, s29, 3 -; GFX9-NEXT: s_add_i32 s28, s28, 3 -; GFX9-NEXT: s_add_i32 s27, s27, 3 -; GFX9-NEXT: s_add_i32 s26, s26, 3 -; GFX9-NEXT: s_add_i32 s25, s25, 3 -; GFX9-NEXT: s_add_i32 s24, s24, 3 ; GFX9-NEXT: s_add_i32 s23, s23, 3 ; GFX9-NEXT: s_add_i32 s22, s22, 3 ; GFX9-NEXT: s_add_i32 s21, s21, 3 @@ -3181,61 +3255,71 @@ define inreg <36 x i16> @bitcast_v18i32_to_v36i16_scalar(<18 x i32> inreg %a, i3 ; GFX9-NEXT: s_add_i32 s18, s18, 3 ; GFX9-NEXT: s_add_i32 s17, s17, 3 ; GFX9-NEXT: s_add_i32 s16, s16, 3 -; GFX9-NEXT: s_lshr_b32 s10, s9, 16 -; GFX9-NEXT: s_lshr_b32 s11, s8, 16 -; GFX9-NEXT: s_lshr_b32 s12, s7, 16 -; GFX9-NEXT: s_lshr_b32 s13, s6, 16 -; GFX9-NEXT: s_lshr_b32 s14, s29, 16 -; GFX9-NEXT: s_lshr_b32 s15, s28, 16 -; GFX9-NEXT: s_lshr_b32 s40, s27, 16 -; GFX9-NEXT: s_lshr_b32 s41, s26, 16 -; GFX9-NEXT: s_lshr_b32 s42, s25, 16 -; GFX9-NEXT: s_lshr_b32 s43, s24, 16 -; GFX9-NEXT: s_lshr_b32 s44, s23, 16 -; GFX9-NEXT: s_lshr_b32 s45, s22, 16 -; GFX9-NEXT: s_lshr_b32 s46, s21, 16 -; GFX9-NEXT: s_lshr_b32 s47, s20, 16 -; GFX9-NEXT: s_lshr_b32 s56, s19, 16 -; GFX9-NEXT: s_lshr_b32 s57, s18, 16 -; GFX9-NEXT: s_lshr_b32 s58, s17, 16 -; GFX9-NEXT: s_lshr_b32 s59, s16, 16 +; GFX9-NEXT: s_add_i32 s15, s15, 3 +; GFX9-NEXT: s_add_i32 s14, s14, 3 +; GFX9-NEXT: s_add_i32 s13, s13, 3 +; GFX9-NEXT: s_add_i32 s12, s12, 3 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_add_i32 s10, s10, 3 +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: s_add_i32 s6, s6, 3 +; GFX9-NEXT: s_lshr_b32 s24, s23, 16 +; GFX9-NEXT: s_lshr_b32 s25, s22, 16 +; GFX9-NEXT: s_lshr_b32 s26, s21, 16 +; GFX9-NEXT: s_lshr_b32 s27, s20, 16 +; GFX9-NEXT: s_lshr_b32 s28, s19, 16 +; GFX9-NEXT: s_lshr_b32 s29, s18, 16 +; GFX9-NEXT: s_lshr_b32 s40, s17, 16 +; GFX9-NEXT: s_lshr_b32 s41, s16, 16 +; GFX9-NEXT: s_lshr_b32 s42, s15, 16 +; GFX9-NEXT: s_lshr_b32 s43, s14, 16 +; GFX9-NEXT: s_lshr_b32 s44, s13, 16 +; GFX9-NEXT: s_lshr_b32 s45, s12, 16 +; GFX9-NEXT: s_lshr_b32 s46, s11, 16 +; GFX9-NEXT: s_lshr_b32 s47, s10, 16 +; GFX9-NEXT: s_lshr_b32 s56, s9, 16 +; GFX9-NEXT: s_lshr_b32 s57, s8, 16 +; GFX9-NEXT: s_lshr_b32 s58, s7, 16 +; GFX9-NEXT: s_lshr_b32 s59, s6, 16 ; GFX9-NEXT: .LBB13_3: ; %end -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s59 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s58 -; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s57 -; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s56 -; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s47 -; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s46 -; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s45 -; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s44 -; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s43 -; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s42 -; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s41 -; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s40 -; GFX9-NEXT: s_pack_ll_b32_b16 s15, s28, s15 -; GFX9-NEXT: s_pack_ll_b32_b16 s14, s29, s14 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s13 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s12 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s11 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s7, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s9, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s10, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s11, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s12, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s13, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s14, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s15, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s16, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s17, s40 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s29 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s28 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s27 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s26 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s25 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s24 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s18 -; GFX9-NEXT: v_mov_b32_e32 v5, s19 -; GFX9-NEXT: v_mov_b32_e32 v6, s20 -; GFX9-NEXT: v_mov_b32_e32 v7, s21 -; GFX9-NEXT: v_mov_b32_e32 v8, s22 -; GFX9-NEXT: v_mov_b32_e32 v9, s23 -; GFX9-NEXT: v_mov_b32_e32 v10, s24 -; GFX9-NEXT: v_mov_b32_e32 v11, s25 -; GFX9-NEXT: v_mov_b32_e32 v12, s15 -; GFX9-NEXT: v_mov_b32_e32 v13, s14 -; GFX9-NEXT: v_mov_b32_e32 v14, s6 -; GFX9-NEXT: v_mov_b32_e32 v15, s7 -; GFX9-NEXT: v_mov_b32_e32 v16, s8 -; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-NEXT: v_mov_b32_e32 v12, s16 +; GFX9-NEXT: v_mov_b32_e32 v13, s17 +; GFX9-NEXT: v_mov_b32_e32 v14, s18 +; GFX9-NEXT: v_mov_b32_e32 v15, s19 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB13_4: ; GFX9-NEXT: ; implicit-def: $sgpr59 @@ -3250,12 +3334,12 @@ define inreg <36 x i16> @bitcast_v18i32_to_v36i16_scalar(<18 x i32> inreg %a, i3 ; GFX9-NEXT: ; implicit-def: $sgpr42 ; GFX9-NEXT: ; implicit-def: $sgpr41 ; GFX9-NEXT: ; implicit-def: $sgpr40 -; GFX9-NEXT: ; implicit-def: $sgpr15 -; GFX9-NEXT: ; implicit-def: $sgpr14 -; GFX9-NEXT: ; implicit-def: $sgpr13 -; GFX9-NEXT: ; implicit-def: $sgpr12 -; GFX9-NEXT: ; implicit-def: $sgpr11 -; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr29 +; GFX9-NEXT: ; implicit-def: $sgpr28 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr25 +; GFX9-NEXT: ; implicit-def: $sgpr24 ; GFX9-NEXT: s_branch .LBB13_2 ; ; GFX11-LABEL: bitcast_v18i32_to_v36i16_scalar: @@ -5438,7 +5522,35 @@ define inreg <36 x half> @bitcast_v18i32_to_v36f16_scalar(<18 x i32> inreg %a, i ; SI-LABEL: bitcast_v18i32_to_v36f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, s16 +; SI-NEXT: v_mov_b32_e32 v7, s17 +; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v9, s19 +; SI-NEXT: v_mov_b32_e32 v10, s20 +; SI-NEXT: v_mov_b32_e32 v11, s21 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v13, s23 +; SI-NEXT: v_mov_b32_e32 v14, s24 +; SI-NEXT: v_mov_b32_e32 v15, s25 +; SI-NEXT: v_mov_b32_e32 v16, s26 +; SI-NEXT: v_mov_b32_e32 v17, s27 +; SI-NEXT: v_mov_b32_e32 v18, s28 +; SI-NEXT: v_mov_b32_e32 v19, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; SI-NEXT: v_readfirstlane_b32 s23, v6 +; SI-NEXT: v_readfirstlane_b32 s22, v7 +; SI-NEXT: v_readfirstlane_b32 s21, v8 +; SI-NEXT: v_readfirstlane_b32 s20, v9 +; SI-NEXT: v_readfirstlane_b32 s19, v10 +; SI-NEXT: v_readfirstlane_b32 s18, v11 +; SI-NEXT: v_readfirstlane_b32 s17, v12 +; SI-NEXT: v_readfirstlane_b32 s16, v13 +; SI-NEXT: v_readfirstlane_b32 s15, v14 +; SI-NEXT: v_readfirstlane_b32 s14, v15 +; SI-NEXT: v_readfirstlane_b32 s13, v16 +; SI-NEXT: v_readfirstlane_b32 s12, v17 +; SI-NEXT: v_readfirstlane_b32 s11, v18 +; SI-NEXT: v_readfirstlane_b32 s10, v19 ; SI-NEXT: v_readfirstlane_b32 s8, v1 ; SI-NEXT: v_readfirstlane_b32 s7, v2 ; SI-NEXT: v_readfirstlane_b32 s6, v3 @@ -5454,86 +5566,86 @@ define inreg <36 x half> @bitcast_v18i32_to_v36f16_scalar(<18 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s8, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s4, s10, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: s_lshr_b32 s4, s11, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: s_lshr_b32 s4, s12, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: s_lshr_b32 s4, s13, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: s_lshr_b32 s4, s14, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: s_lshr_b32 s4, s15, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: s_lshr_b32 s4, s17, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: s_lshr_b32 s4, s18, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: s_lshr_b32 s4, s21, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: s_lshr_b32 s4, s22, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_lshr_b32 s4, s23, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s23 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 ; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: s_lshr_b32 s5, s17, 16 -; SI-NEXT: s_lshr_b32 s10, s18, 16 -; SI-NEXT: s_lshr_b32 s11, s19, 16 -; SI-NEXT: s_lshr_b32 s12, s20, 16 -; SI-NEXT: s_lshr_b32 s13, s21, 16 -; SI-NEXT: s_lshr_b32 s14, s22, 16 -; SI-NEXT: s_lshr_b32 s15, s23, 16 -; SI-NEXT: s_lshr_b32 s40, s24, 16 -; SI-NEXT: s_lshr_b32 s41, s25, 16 -; SI-NEXT: s_lshr_b32 s42, s26, 16 -; SI-NEXT: s_lshr_b32 s43, s27, 16 -; SI-NEXT: s_lshr_b32 s44, s28, 16 -; SI-NEXT: s_lshr_b32 s45, s29, 16 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: s_lshr_b32 s5, s22, 16 +; SI-NEXT: s_lshr_b32 s24, s21, 16 +; SI-NEXT: s_lshr_b32 s25, s20, 16 +; SI-NEXT: s_lshr_b32 s26, s19, 16 +; SI-NEXT: s_lshr_b32 s27, s18, 16 +; SI-NEXT: s_lshr_b32 s28, s17, 16 +; SI-NEXT: s_lshr_b32 s29, s16, 16 +; SI-NEXT: s_lshr_b32 s40, s15, 16 +; SI-NEXT: s_lshr_b32 s41, s14, 16 +; SI-NEXT: s_lshr_b32 s42, s13, 16 +; SI-NEXT: s_lshr_b32 s43, s12, 16 +; SI-NEXT: s_lshr_b32 s44, s11, 16 +; SI-NEXT: s_lshr_b32 s45, s10, 16 ; SI-NEXT: s_lshr_b32 s46, s8, 16 ; SI-NEXT: s_lshr_b32 s47, s7, 16 ; SI-NEXT: s_lshr_b32 s56, s6, 16 @@ -5542,20 +5654,20 @@ define inreg <36 x half> @bitcast_v18i32_to_v36f16_scalar(<18 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s23 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s57 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s56 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s47 @@ -5566,12 +5678,12 @@ define inreg <36 x half> @bitcast_v18i32_to_v36f16_scalar(<18 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v15, s42 ; SI-NEXT: v_cvt_f32_f16_e32 v17, s41 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s24 ; SI-NEXT: v_cvt_f32_f16_e32 v34, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 ; SI-NEXT: .LBB17_3: ; %end @@ -5743,7 +5855,35 @@ define inreg <36 x half> @bitcast_v18i32_to_v36f16_scalar(<18 x i32> inreg %a, i ; VI-LABEL: bitcast_v18i32_to_v36f16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v5, s16 +; VI-NEXT: v_mov_b32_e32 v6, s17 +; VI-NEXT: v_mov_b32_e32 v7, s18 +; VI-NEXT: v_mov_b32_e32 v8, s19 +; VI-NEXT: v_mov_b32_e32 v9, s20 +; VI-NEXT: v_mov_b32_e32 v10, s21 +; VI-NEXT: v_mov_b32_e32 v11, s22 +; VI-NEXT: v_mov_b32_e32 v12, s23 +; VI-NEXT: v_mov_b32_e32 v13, s24 +; VI-NEXT: v_mov_b32_e32 v14, s25 +; VI-NEXT: v_mov_b32_e32 v15, s26 +; VI-NEXT: v_mov_b32_e32 v16, s27 +; VI-NEXT: v_mov_b32_e32 v17, s28 +; VI-NEXT: v_mov_b32_e32 v18, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_readfirstlane_b32 s23, v5 +; VI-NEXT: v_readfirstlane_b32 s22, v6 +; VI-NEXT: v_readfirstlane_b32 s21, v7 +; VI-NEXT: v_readfirstlane_b32 s20, v8 +; VI-NEXT: v_readfirstlane_b32 s19, v9 +; VI-NEXT: v_readfirstlane_b32 s18, v10 +; VI-NEXT: v_readfirstlane_b32 s17, v11 +; VI-NEXT: v_readfirstlane_b32 s16, v12 +; VI-NEXT: v_readfirstlane_b32 s15, v13 +; VI-NEXT: v_readfirstlane_b32 s14, v14 +; VI-NEXT: v_readfirstlane_b32 s13, v15 +; VI-NEXT: v_readfirstlane_b32 s12, v16 +; VI-NEXT: v_readfirstlane_b32 s11, v17 +; VI-NEXT: v_readfirstlane_b32 s10, v18 ; VI-NEXT: v_readfirstlane_b32 s9, v0 ; VI-NEXT: v_readfirstlane_b32 s8, v1 ; VI-NEXT: v_readfirstlane_b32 s6, v2 @@ -5751,131 +5891,131 @@ define inreg <36 x half> @bitcast_v18i32_to_v36f16_scalar(<18 x i32> inreg %a, i ; VI-NEXT: v_readfirstlane_b32 s7, v3 ; VI-NEXT: s_cbranch_scc0 .LBB17_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s10, s7, 16 -; VI-NEXT: s_lshr_b32 s11, s6, 16 -; VI-NEXT: s_lshr_b32 s12, s8, 16 -; VI-NEXT: s_lshr_b32 s13, s9, 16 -; VI-NEXT: s_lshr_b32 s14, s29, 16 -; VI-NEXT: s_lshr_b32 s15, s28, 16 -; VI-NEXT: s_lshr_b32 s40, s27, 16 -; VI-NEXT: s_lshr_b32 s41, s26, 16 -; VI-NEXT: s_lshr_b32 s42, s25, 16 -; VI-NEXT: s_lshr_b32 s43, s24, 16 -; VI-NEXT: s_lshr_b32 s44, s23, 16 -; VI-NEXT: s_lshr_b32 s45, s22, 16 -; VI-NEXT: s_lshr_b32 s46, s21, 16 -; VI-NEXT: s_lshr_b32 s47, s20, 16 -; VI-NEXT: s_lshr_b32 s56, s19, 16 -; VI-NEXT: s_lshr_b32 s57, s18, 16 -; VI-NEXT: s_lshr_b32 s58, s17, 16 -; VI-NEXT: s_lshr_b32 s59, s16, 16 +; VI-NEXT: s_lshr_b32 s24, s7, 16 +; VI-NEXT: s_lshr_b32 s25, s6, 16 +; VI-NEXT: s_lshr_b32 s26, s8, 16 +; VI-NEXT: s_lshr_b32 s27, s9, 16 +; VI-NEXT: s_lshr_b32 s28, s10, 16 +; VI-NEXT: s_lshr_b32 s29, s11, 16 +; VI-NEXT: s_lshr_b32 s40, s12, 16 +; VI-NEXT: s_lshr_b32 s41, s13, 16 +; VI-NEXT: s_lshr_b32 s42, s14, 16 +; VI-NEXT: s_lshr_b32 s43, s15, 16 +; VI-NEXT: s_lshr_b32 s44, s16, 16 +; VI-NEXT: s_lshr_b32 s45, s17, 16 +; VI-NEXT: s_lshr_b32 s46, s18, 16 +; VI-NEXT: s_lshr_b32 s47, s19, 16 +; VI-NEXT: s_lshr_b32 s56, s20, 16 +; VI-NEXT: s_lshr_b32 s57, s21, 16 +; VI-NEXT: s_lshr_b32 s58, s22, 16 +; VI-NEXT: s_lshr_b32 s59, s23, 16 ; VI-NEXT: s_cbranch_execnz .LBB17_3 ; VI-NEXT: .LBB17_2: ; %cmp.true ; VI-NEXT: s_add_i32 s7, s7, 3 ; VI-NEXT: s_add_i32 s6, s6, 3 ; VI-NEXT: s_add_i32 s8, s8, 3 ; VI-NEXT: s_add_i32 s9, s9, 3 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_lshr_b32 s10, s7, 16 -; VI-NEXT: s_lshr_b32 s11, s6, 16 -; VI-NEXT: s_lshr_b32 s12, s8, 16 -; VI-NEXT: s_lshr_b32 s13, s9, 16 -; VI-NEXT: s_lshr_b32 s14, s29, 16 -; VI-NEXT: s_lshr_b32 s15, s28, 16 -; VI-NEXT: s_lshr_b32 s40, s27, 16 -; VI-NEXT: s_lshr_b32 s41, s26, 16 -; VI-NEXT: s_lshr_b32 s42, s25, 16 -; VI-NEXT: s_lshr_b32 s43, s24, 16 -; VI-NEXT: s_lshr_b32 s44, s23, 16 -; VI-NEXT: s_lshr_b32 s45, s22, 16 -; VI-NEXT: s_lshr_b32 s46, s21, 16 -; VI-NEXT: s_lshr_b32 s47, s20, 16 -; VI-NEXT: s_lshr_b32 s56, s19, 16 -; VI-NEXT: s_lshr_b32 s57, s18, 16 -; VI-NEXT: s_lshr_b32 s58, s17, 16 -; VI-NEXT: s_lshr_b32 s59, s16, 16 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_lshr_b32 s24, s7, 16 +; VI-NEXT: s_lshr_b32 s25, s6, 16 +; VI-NEXT: s_lshr_b32 s26, s8, 16 +; VI-NEXT: s_lshr_b32 s27, s9, 16 +; VI-NEXT: s_lshr_b32 s28, s10, 16 +; VI-NEXT: s_lshr_b32 s29, s11, 16 +; VI-NEXT: s_lshr_b32 s40, s12, 16 +; VI-NEXT: s_lshr_b32 s41, s13, 16 +; VI-NEXT: s_lshr_b32 s42, s14, 16 +; VI-NEXT: s_lshr_b32 s43, s15, 16 +; VI-NEXT: s_lshr_b32 s44, s16, 16 +; VI-NEXT: s_lshr_b32 s45, s17, 16 +; VI-NEXT: s_lshr_b32 s46, s18, 16 +; VI-NEXT: s_lshr_b32 s47, s19, 16 +; VI-NEXT: s_lshr_b32 s56, s20, 16 +; VI-NEXT: s_lshr_b32 s57, s21, 16 +; VI-NEXT: s_lshr_b32 s58, s22, 16 +; VI-NEXT: s_lshr_b32 s59, s23, 16 ; VI-NEXT: .LBB17_3: ; %end -; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_and_b32 s4, 0xffff, s23 ; VI-NEXT: s_lshl_b32 s5, s59, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s16, s58, 16 -; VI-NEXT: s_or_b32 s5, s5, s16 -; VI-NEXT: s_and_b32 s16, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s17, s57, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_and_b32 s17, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s18, s56, 16 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s18, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s19, s47, 16 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_and_b32 s19, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s20, s46, 16 -; VI-NEXT: s_or_b32 s19, s19, s20 -; VI-NEXT: s_and_b32 s20, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s21, s45, 16 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_and_b32 s21, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s22, s44, 16 +; VI-NEXT: s_and_b32 s5, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s22, s58, 16 +; VI-NEXT: s_or_b32 s5, s5, s22 +; VI-NEXT: s_and_b32 s21, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s22, s57, 16 ; VI-NEXT: s_or_b32 s21, s21, s22 -; VI-NEXT: s_and_b32 s22, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s23, s43, 16 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_and_b32 s23, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s24, s42, 16 -; VI-NEXT: s_or_b32 s23, s23, s24 -; VI-NEXT: s_and_b32 s24, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s25, s41, 16 -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_and_b32 s25, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s26, s40, 16 -; VI-NEXT: s_or_b32 s25, s25, s26 -; VI-NEXT: s_and_b32 s26, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s15, s15, 16 -; VI-NEXT: s_or_b32 s15, s26, s15 -; VI-NEXT: s_and_b32 s26, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_and_b32 s20, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s22, s56, 16 +; VI-NEXT: s_or_b32 s20, s20, s22 +; VI-NEXT: s_and_b32 s19, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s22, s47, 16 +; VI-NEXT: s_or_b32 s19, s19, s22 +; VI-NEXT: s_and_b32 s18, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s22, s46, 16 +; VI-NEXT: s_or_b32 s18, s18, s22 +; VI-NEXT: s_and_b32 s17, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s22, s45, 16 +; VI-NEXT: s_or_b32 s17, s17, s22 +; VI-NEXT: s_and_b32 s16, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s22, s44, 16 +; VI-NEXT: s_or_b32 s16, s16, s22 +; VI-NEXT: s_and_b32 s15, 0xffff, s15 +; VI-NEXT: s_lshl_b32 s22, s43, 16 +; VI-NEXT: s_or_b32 s15, s15, s22 +; VI-NEXT: s_and_b32 s14, 0xffff, s14 +; VI-NEXT: s_lshl_b32 s22, s42, 16 +; VI-NEXT: s_or_b32 s14, s14, s22 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s22, s41, 16 +; VI-NEXT: s_or_b32 s13, s13, s22 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s22, s40, 16 +; VI-NEXT: s_or_b32 s12, s12, s22 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s22, s29, 16 +; VI-NEXT: s_or_b32 s11, s11, s22 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s22, s28, 16 +; VI-NEXT: s_or_b32 s10, s10, s22 ; VI-NEXT: s_and_b32 s9, 0xffff, s9 -; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_lshl_b32 s22, s27, 16 +; VI-NEXT: s_or_b32 s9, s9, s22 ; VI-NEXT: s_and_b32 s8, 0xffff, s8 -; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_lshl_b32 s22, s26, 16 +; VI-NEXT: s_or_b32 s8, s8, s22 ; VI-NEXT: s_and_b32 s6, 0xffff, s6 -; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_lshl_b32 s22, s25, 16 +; VI-NEXT: s_or_b32 s6, s6, s22 ; VI-NEXT: s_and_b32 s7, 0xffff, s7 -; VI-NEXT: s_lshl_b32 s10, s10, 16 -; VI-NEXT: s_or_b32 s14, s26, s14 -; VI-NEXT: s_or_b32 s9, s9, s13 -; VI-NEXT: s_or_b32 s8, s8, s12 -; VI-NEXT: s_or_b32 s6, s6, s11 -; VI-NEXT: s_or_b32 s7, s7, s10 +; VI-NEXT: s_lshl_b32 s22, s24, 16 +; VI-NEXT: s_or_b32 s7, s7, s22 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: v_mov_b32_e32 v3, s17 -; VI-NEXT: v_mov_b32_e32 v4, s18 -; VI-NEXT: v_mov_b32_e32 v5, s19 -; VI-NEXT: v_mov_b32_e32 v6, s20 -; VI-NEXT: v_mov_b32_e32 v7, s21 -; VI-NEXT: v_mov_b32_e32 v8, s22 -; VI-NEXT: v_mov_b32_e32 v9, s23 -; VI-NEXT: v_mov_b32_e32 v10, s24 -; VI-NEXT: v_mov_b32_e32 v11, s25 -; VI-NEXT: v_mov_b32_e32 v12, s15 -; VI-NEXT: v_mov_b32_e32 v13, s14 +; VI-NEXT: v_mov_b32_e32 v2, s21 +; VI-NEXT: v_mov_b32_e32 v3, s20 +; VI-NEXT: v_mov_b32_e32 v4, s19 +; VI-NEXT: v_mov_b32_e32 v5, s18 +; VI-NEXT: v_mov_b32_e32 v6, s17 +; VI-NEXT: v_mov_b32_e32 v7, s16 +; VI-NEXT: v_mov_b32_e32 v8, s15 +; VI-NEXT: v_mov_b32_e32 v9, s14 +; VI-NEXT: v_mov_b32_e32 v10, s13 +; VI-NEXT: v_mov_b32_e32 v11, s12 +; VI-NEXT: v_mov_b32_e32 v12, s11 +; VI-NEXT: v_mov_b32_e32 v13, s10 ; VI-NEXT: v_mov_b32_e32 v14, s9 ; VI-NEXT: v_mov_b32_e32 v15, s8 ; VI-NEXT: v_mov_b32_e32 v16, s6 @@ -5894,55 +6034,73 @@ define inreg <36 x half> @bitcast_v18i32_to_v36f16_scalar(<18 x i32> inreg %a, i ; VI-NEXT: ; implicit-def: $sgpr42 ; VI-NEXT: ; implicit-def: $sgpr41 ; VI-NEXT: ; implicit-def: $sgpr40 -; VI-NEXT: ; implicit-def: $sgpr15 -; VI-NEXT: ; implicit-def: $sgpr14 -; VI-NEXT: ; implicit-def: $sgpr13 -; VI-NEXT: ; implicit-def: $sgpr12 -; VI-NEXT: ; implicit-def: $sgpr11 -; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr25 +; VI-NEXT: ; implicit-def: $sgpr24 ; VI-NEXT: s_branch .LBB17_2 ; ; GFX9-LABEL: bitcast_v18i32_to_v36f16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v5, s16 +; GFX9-NEXT: v_mov_b32_e32 v6, s17 +; GFX9-NEXT: v_mov_b32_e32 v7, s18 +; GFX9-NEXT: v_mov_b32_e32 v8, s19 +; GFX9-NEXT: v_mov_b32_e32 v9, s20 +; GFX9-NEXT: v_mov_b32_e32 v10, s21 +; GFX9-NEXT: v_mov_b32_e32 v11, s22 +; GFX9-NEXT: v_mov_b32_e32 v12, s23 +; GFX9-NEXT: v_mov_b32_e32 v13, s24 +; GFX9-NEXT: v_mov_b32_e32 v14, s25 +; GFX9-NEXT: v_mov_b32_e32 v15, s26 +; GFX9-NEXT: v_mov_b32_e32 v16, s27 +; GFX9-NEXT: v_mov_b32_e32 v17, s28 +; GFX9-NEXT: v_mov_b32_e32 v18, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: v_readfirstlane_b32 s7, v1 -; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s6, v5 +; GFX9-NEXT: v_readfirstlane_b32 s7, v6 +; GFX9-NEXT: v_readfirstlane_b32 s8, v7 +; GFX9-NEXT: v_readfirstlane_b32 s9, v8 +; GFX9-NEXT: v_readfirstlane_b32 s10, v9 +; GFX9-NEXT: v_readfirstlane_b32 s11, v10 +; GFX9-NEXT: v_readfirstlane_b32 s12, v11 +; GFX9-NEXT: v_readfirstlane_b32 s13, v12 +; GFX9-NEXT: v_readfirstlane_b32 s14, v13 +; GFX9-NEXT: v_readfirstlane_b32 s15, v14 +; GFX9-NEXT: v_readfirstlane_b32 s16, v15 +; GFX9-NEXT: v_readfirstlane_b32 s17, v16 +; GFX9-NEXT: v_readfirstlane_b32 s18, v17 +; GFX9-NEXT: v_readfirstlane_b32 s19, v18 +; GFX9-NEXT: v_readfirstlane_b32 s20, v0 +; GFX9-NEXT: v_readfirstlane_b32 s21, v1 +; GFX9-NEXT: v_readfirstlane_b32 s22, v2 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s23, v3 ; GFX9-NEXT: s_cbranch_scc0 .LBB17_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_lshr_b32 s10, s9, 16 -; GFX9-NEXT: s_lshr_b32 s11, s8, 16 -; GFX9-NEXT: s_lshr_b32 s12, s7, 16 -; GFX9-NEXT: s_lshr_b32 s13, s6, 16 -; GFX9-NEXT: s_lshr_b32 s14, s29, 16 -; GFX9-NEXT: s_lshr_b32 s15, s28, 16 -; GFX9-NEXT: s_lshr_b32 s40, s27, 16 -; GFX9-NEXT: s_lshr_b32 s41, s26, 16 -; GFX9-NEXT: s_lshr_b32 s42, s25, 16 -; GFX9-NEXT: s_lshr_b32 s43, s24, 16 -; GFX9-NEXT: s_lshr_b32 s44, s23, 16 -; GFX9-NEXT: s_lshr_b32 s45, s22, 16 -; GFX9-NEXT: s_lshr_b32 s46, s21, 16 -; GFX9-NEXT: s_lshr_b32 s47, s20, 16 -; GFX9-NEXT: s_lshr_b32 s56, s19, 16 -; GFX9-NEXT: s_lshr_b32 s57, s18, 16 -; GFX9-NEXT: s_lshr_b32 s58, s17, 16 -; GFX9-NEXT: s_lshr_b32 s59, s16, 16 +; GFX9-NEXT: s_lshr_b32 s24, s23, 16 +; GFX9-NEXT: s_lshr_b32 s25, s22, 16 +; GFX9-NEXT: s_lshr_b32 s26, s21, 16 +; GFX9-NEXT: s_lshr_b32 s27, s20, 16 +; GFX9-NEXT: s_lshr_b32 s28, s19, 16 +; GFX9-NEXT: s_lshr_b32 s29, s18, 16 +; GFX9-NEXT: s_lshr_b32 s40, s17, 16 +; GFX9-NEXT: s_lshr_b32 s41, s16, 16 +; GFX9-NEXT: s_lshr_b32 s42, s15, 16 +; GFX9-NEXT: s_lshr_b32 s43, s14, 16 +; GFX9-NEXT: s_lshr_b32 s44, s13, 16 +; GFX9-NEXT: s_lshr_b32 s45, s12, 16 +; GFX9-NEXT: s_lshr_b32 s46, s11, 16 +; GFX9-NEXT: s_lshr_b32 s47, s10, 16 +; GFX9-NEXT: s_lshr_b32 s56, s9, 16 +; GFX9-NEXT: s_lshr_b32 s57, s8, 16 +; GFX9-NEXT: s_lshr_b32 s58, s7, 16 +; GFX9-NEXT: s_lshr_b32 s59, s6, 16 ; GFX9-NEXT: s_cbranch_execnz .LBB17_3 ; GFX9-NEXT: .LBB17_2: ; %cmp.true -; GFX9-NEXT: s_add_i32 s9, s9, 3 -; GFX9-NEXT: s_add_i32 s8, s8, 3 -; GFX9-NEXT: s_add_i32 s7, s7, 3 -; GFX9-NEXT: s_add_i32 s6, s6, 3 -; GFX9-NEXT: s_add_i32 s29, s29, 3 -; GFX9-NEXT: s_add_i32 s28, s28, 3 -; GFX9-NEXT: s_add_i32 s27, s27, 3 -; GFX9-NEXT: s_add_i32 s26, s26, 3 -; GFX9-NEXT: s_add_i32 s25, s25, 3 -; GFX9-NEXT: s_add_i32 s24, s24, 3 ; GFX9-NEXT: s_add_i32 s23, s23, 3 ; GFX9-NEXT: s_add_i32 s22, s22, 3 ; GFX9-NEXT: s_add_i32 s21, s21, 3 @@ -5951,61 +6109,71 @@ define inreg <36 x half> @bitcast_v18i32_to_v36f16_scalar(<18 x i32> inreg %a, i ; GFX9-NEXT: s_add_i32 s18, s18, 3 ; GFX9-NEXT: s_add_i32 s17, s17, 3 ; GFX9-NEXT: s_add_i32 s16, s16, 3 -; GFX9-NEXT: s_lshr_b32 s10, s9, 16 -; GFX9-NEXT: s_lshr_b32 s11, s8, 16 -; GFX9-NEXT: s_lshr_b32 s12, s7, 16 -; GFX9-NEXT: s_lshr_b32 s13, s6, 16 -; GFX9-NEXT: s_lshr_b32 s14, s29, 16 -; GFX9-NEXT: s_lshr_b32 s15, s28, 16 -; GFX9-NEXT: s_lshr_b32 s40, s27, 16 -; GFX9-NEXT: s_lshr_b32 s41, s26, 16 -; GFX9-NEXT: s_lshr_b32 s42, s25, 16 -; GFX9-NEXT: s_lshr_b32 s43, s24, 16 -; GFX9-NEXT: s_lshr_b32 s44, s23, 16 -; GFX9-NEXT: s_lshr_b32 s45, s22, 16 -; GFX9-NEXT: s_lshr_b32 s46, s21, 16 -; GFX9-NEXT: s_lshr_b32 s47, s20, 16 -; GFX9-NEXT: s_lshr_b32 s56, s19, 16 -; GFX9-NEXT: s_lshr_b32 s57, s18, 16 -; GFX9-NEXT: s_lshr_b32 s58, s17, 16 -; GFX9-NEXT: s_lshr_b32 s59, s16, 16 +; GFX9-NEXT: s_add_i32 s15, s15, 3 +; GFX9-NEXT: s_add_i32 s14, s14, 3 +; GFX9-NEXT: s_add_i32 s13, s13, 3 +; GFX9-NEXT: s_add_i32 s12, s12, 3 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_add_i32 s10, s10, 3 +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: s_add_i32 s6, s6, 3 +; GFX9-NEXT: s_lshr_b32 s24, s23, 16 +; GFX9-NEXT: s_lshr_b32 s25, s22, 16 +; GFX9-NEXT: s_lshr_b32 s26, s21, 16 +; GFX9-NEXT: s_lshr_b32 s27, s20, 16 +; GFX9-NEXT: s_lshr_b32 s28, s19, 16 +; GFX9-NEXT: s_lshr_b32 s29, s18, 16 +; GFX9-NEXT: s_lshr_b32 s40, s17, 16 +; GFX9-NEXT: s_lshr_b32 s41, s16, 16 +; GFX9-NEXT: s_lshr_b32 s42, s15, 16 +; GFX9-NEXT: s_lshr_b32 s43, s14, 16 +; GFX9-NEXT: s_lshr_b32 s44, s13, 16 +; GFX9-NEXT: s_lshr_b32 s45, s12, 16 +; GFX9-NEXT: s_lshr_b32 s46, s11, 16 +; GFX9-NEXT: s_lshr_b32 s47, s10, 16 +; GFX9-NEXT: s_lshr_b32 s56, s9, 16 +; GFX9-NEXT: s_lshr_b32 s57, s8, 16 +; GFX9-NEXT: s_lshr_b32 s58, s7, 16 +; GFX9-NEXT: s_lshr_b32 s59, s6, 16 ; GFX9-NEXT: .LBB17_3: ; %end -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s59 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s58 -; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s57 -; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s56 -; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s47 -; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s46 -; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s45 -; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s44 -; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s43 -; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s42 -; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s41 -; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s40 -; GFX9-NEXT: s_pack_ll_b32_b16 s15, s28, s15 -; GFX9-NEXT: s_pack_ll_b32_b16 s14, s29, s14 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s13 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s12 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s11 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s7, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s9, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s10, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s11, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s12, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s13, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s14, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s15, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s16, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s17, s40 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s29 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s28 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s27 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s26 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s25 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s24 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s18 -; GFX9-NEXT: v_mov_b32_e32 v5, s19 -; GFX9-NEXT: v_mov_b32_e32 v6, s20 -; GFX9-NEXT: v_mov_b32_e32 v7, s21 -; GFX9-NEXT: v_mov_b32_e32 v8, s22 -; GFX9-NEXT: v_mov_b32_e32 v9, s23 -; GFX9-NEXT: v_mov_b32_e32 v10, s24 -; GFX9-NEXT: v_mov_b32_e32 v11, s25 -; GFX9-NEXT: v_mov_b32_e32 v12, s15 -; GFX9-NEXT: v_mov_b32_e32 v13, s14 -; GFX9-NEXT: v_mov_b32_e32 v14, s6 -; GFX9-NEXT: v_mov_b32_e32 v15, s7 -; GFX9-NEXT: v_mov_b32_e32 v16, s8 -; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-NEXT: v_mov_b32_e32 v12, s16 +; GFX9-NEXT: v_mov_b32_e32 v13, s17 +; GFX9-NEXT: v_mov_b32_e32 v14, s18 +; GFX9-NEXT: v_mov_b32_e32 v15, s19 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB17_4: ; GFX9-NEXT: ; implicit-def: $sgpr59 @@ -6020,12 +6188,12 @@ define inreg <36 x half> @bitcast_v18i32_to_v36f16_scalar(<18 x i32> inreg %a, i ; GFX9-NEXT: ; implicit-def: $sgpr42 ; GFX9-NEXT: ; implicit-def: $sgpr41 ; GFX9-NEXT: ; implicit-def: $sgpr40 -; GFX9-NEXT: ; implicit-def: $sgpr15 -; GFX9-NEXT: ; implicit-def: $sgpr14 -; GFX9-NEXT: ; implicit-def: $sgpr13 -; GFX9-NEXT: ; implicit-def: $sgpr12 -; GFX9-NEXT: ; implicit-def: $sgpr11 -; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr29 +; GFX9-NEXT: ; implicit-def: $sgpr28 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr25 +; GFX9-NEXT: ; implicit-def: $sgpr24 ; GFX9-NEXT: s_branch .LBB17_2 ; ; GFX11-LABEL: bitcast_v18i32_to_v36f16_scalar: @@ -12552,277 +12720,306 @@ define inreg <36 x half> @bitcast_v18f32_to_v36f16_scalar(<18 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; SI-NEXT: v_readfirstlane_b32 s8, v1 -; SI-NEXT: v_readfirstlane_b32 s7, v2 -; SI-NEXT: v_readfirstlane_b32 s6, v3 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v46, s16 +; SI-NEXT: v_mov_b32_e32 v45, s17 +; SI-NEXT: v_mov_b32_e32 v44, s18 +; SI-NEXT: v_mov_b32_e32 v43, s19 +; SI-NEXT: v_mov_b32_e32 v42, s20 +; SI-NEXT: v_mov_b32_e32 v41, s21 +; SI-NEXT: v_mov_b32_e32 v40, s22 +; SI-NEXT: v_mov_b32_e32 v55, s23 +; SI-NEXT: v_mov_b32_e32 v54, s24 +; SI-NEXT: v_mov_b32_e32 v53, s25 +; SI-NEXT: v_mov_b32_e32 v51, s26 +; SI-NEXT: v_mov_b32_e32 v50, s27 +; SI-NEXT: v_mov_b32_e32 v49, s28 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v4 +; SI-NEXT: v_mov_b32_e32 v52, s29 ; SI-NEXT: s_cbranch_scc0 .LBB33_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v46 ; SI-NEXT: s_cbranch_execnz .LBB33_3 ; SI-NEXT: .LBB33_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v15, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v17, s24, 1.0 -; SI-NEXT: v_add_f32_e64 v18, s25, 1.0 -; SI-NEXT: v_add_f32_e64 v16, s26, 1.0 -; SI-NEXT: v_add_f32_e64 v14, s27, 1.0 -; SI-NEXT: v_add_f32_e64 v13, s28, 1.0 -; SI-NEXT: v_add_f32_e64 v11, s29, 1.0 -; SI-NEXT: v_add_f32_e64 v9, s8, 1.0 -; SI-NEXT: v_add_f32_e64 v7, s7, 1.0 -; SI-NEXT: v_add_f32_e64 v5, s6, 1.0 -; SI-NEXT: v_add_f32_e64 v3, s9, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v46 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v45 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v44 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v43 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v42 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v41 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v40 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v55 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v54 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v53 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v51 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v50 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v49 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v52 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: .LBB33_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v35, v35, v36 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v35, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v33, v33, v34 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v33, vcc, 8, v0 -; SI-NEXT: v_or_b32_e32 v31, v31, v32 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v31, vcc, 12, v0 -; SI-NEXT: v_or_b32_e32 v29, v29, v30 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v29, vcc, 16, v0 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v27, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v30 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v25, vcc, 24, v0 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v28 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v23, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v26 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v21, vcc, 32, v0 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v24 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v19, vcc, 36, v0 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v22 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v17, vcc, 40, v0 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v20 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v15, vcc, 44, v0 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v17 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v11 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr26 @@ -12834,10 +13031,10 @@ define inreg <36 x half> @bitcast_v18f32_to_v36f16_scalar(<18 x float> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr10 @@ -12846,11 +13043,7 @@ define inreg <36 x half> @bitcast_v18f32_to_v36f16_scalar(<18 x float> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_branch .LBB33_2 ; ; VI-LABEL: bitcast_v18f32_to_v36f16_scalar: @@ -16237,169 +16430,197 @@ define inreg <36 x i16> @bitcast_v9i64_to_v36i16_scalar(<9 x i64> inreg %a, i32 ; SI-LABEL: bitcast_v9i64_to_v36i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, s16 +; SI-NEXT: v_mov_b32_e32 v7, s17 +; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v9, s19 +; SI-NEXT: v_mov_b32_e32 v10, s20 +; SI-NEXT: v_mov_b32_e32 v11, s21 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v13, s23 +; SI-NEXT: v_mov_b32_e32 v14, s24 +; SI-NEXT: v_mov_b32_e32 v15, s25 +; SI-NEXT: v_mov_b32_e32 v16, s26 +; SI-NEXT: v_mov_b32_e32 v17, s27 +; SI-NEXT: v_mov_b32_e32 v18, s28 +; SI-NEXT: v_mov_b32_e32 v19, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; SI-NEXT: v_readfirstlane_b32 s20, v6 +; SI-NEXT: v_readfirstlane_b32 s21, v7 +; SI-NEXT: v_readfirstlane_b32 s18, v8 +; SI-NEXT: v_readfirstlane_b32 s19, v9 +; SI-NEXT: v_readfirstlane_b32 s16, v10 +; SI-NEXT: v_readfirstlane_b32 s17, v11 +; SI-NEXT: v_readfirstlane_b32 s14, v12 +; SI-NEXT: v_readfirstlane_b32 s15, v13 +; SI-NEXT: v_readfirstlane_b32 s12, v14 +; SI-NEXT: v_readfirstlane_b32 s13, v15 +; SI-NEXT: v_readfirstlane_b32 s10, v16 +; SI-NEXT: v_readfirstlane_b32 s11, v17 +; SI-NEXT: v_readfirstlane_b32 s8, v18 +; SI-NEXT: v_readfirstlane_b32 s9, v19 ; SI-NEXT: v_readfirstlane_b32 s6, v1 ; SI-NEXT: v_readfirstlane_b32 s7, v2 ; SI-NEXT: v_readfirstlane_b32 s4, v3 -; SI-NEXT: s_and_b64 s[8:9], vcc, exec +; SI-NEXT: s_and_b64 s[22:23], vcc, exec ; SI-NEXT: v_readfirstlane_b32 s5, v4 ; SI-NEXT: s_cbranch_scc0 .LBB41_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s60, s5, 16 ; SI-NEXT: s_lshr_b32 s61, s7, 16 -; SI-NEXT: s_lshr_b32 s62, s29, 16 -; SI-NEXT: s_lshr_b32 s63, s27, 16 -; SI-NEXT: s_lshr_b32 s72, s25, 16 -; SI-NEXT: s_lshr_b32 s73, s23, 16 -; SI-NEXT: s_lshr_b32 s74, s21, 16 +; SI-NEXT: s_lshr_b32 s62, s9, 16 +; SI-NEXT: s_lshr_b32 s63, s11, 16 +; SI-NEXT: s_lshr_b32 s72, s13, 16 +; SI-NEXT: s_lshr_b32 s73, s15, 16 +; SI-NEXT: s_lshr_b32 s74, s17, 16 ; SI-NEXT: s_lshr_b32 s75, s19, 16 -; SI-NEXT: s_lshr_b32 s76, s17, 16 -; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[10:11], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[12:13], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[14:15], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[40:41], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[42:43], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[20:21], 16 +; SI-NEXT: s_lshr_b32 s76, s21, 16 +; SI-NEXT: s_lshr_b64 s[22:23], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[16:17], 16 ; SI-NEXT: s_lshr_b64 s[46:47], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[56:57], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[20:21], 16 ; SI-NEXT: s_cbranch_execnz .LBB41_3 ; SI-NEXT: .LBB41_2: ; %cmp.true ; SI-NEXT: s_add_u32 s4, s4, 3 ; SI-NEXT: s_addc_u32 s5, s5, 0 ; SI-NEXT: s_add_u32 s6, s6, 3 ; SI-NEXT: s_addc_u32 s7, s7, 0 -; SI-NEXT: s_add_u32 s28, s28, 3 -; SI-NEXT: s_addc_u32 s29, s29, 0 -; SI-NEXT: s_add_u32 s26, s26, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 -; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s25, s25, 0 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s8, s8, 3 +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 ; SI-NEXT: s_add_u32 s16, s16, 3 ; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 ; SI-NEXT: s_lshr_b32 s60, s5, 16 ; SI-NEXT: s_lshr_b32 s61, s7, 16 -; SI-NEXT: s_lshr_b32 s62, s29, 16 -; SI-NEXT: s_lshr_b32 s63, s27, 16 -; SI-NEXT: s_lshr_b32 s72, s25, 16 -; SI-NEXT: s_lshr_b32 s73, s23, 16 -; SI-NEXT: s_lshr_b32 s74, s21, 16 +; SI-NEXT: s_lshr_b32 s62, s9, 16 +; SI-NEXT: s_lshr_b32 s63, s11, 16 +; SI-NEXT: s_lshr_b32 s72, s13, 16 +; SI-NEXT: s_lshr_b32 s73, s15, 16 +; SI-NEXT: s_lshr_b32 s74, s17, 16 ; SI-NEXT: s_lshr_b32 s75, s19, 16 -; SI-NEXT: s_lshr_b32 s76, s17, 16 -; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[10:11], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[12:13], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[14:15], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[40:41], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[42:43], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[20:21], 16 +; SI-NEXT: s_lshr_b32 s76, s21, 16 +; SI-NEXT: s_lshr_b64 s[22:23], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[16:17], 16 ; SI-NEXT: s_lshr_b64 s[46:47], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[56:57], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[20:21], 16 ; SI-NEXT: .LBB41_3: ; %end -; SI-NEXT: s_lshl_b32 s9, s56, 16 -; SI-NEXT: s_and_b32 s11, s16, 0xffff -; SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: s_and_b32 s9, s17, 0xffff -; SI-NEXT: s_lshl_b32 s11, s76, 16 -; SI-NEXT: s_or_b32 s9, s9, s11 -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: s_and_b32 s9, s18, 0xffff -; SI-NEXT: s_lshl_b32 s11, s46, 16 +; SI-NEXT: s_lshl_b32 s23, s56, 16 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_or_b32 s20, s20, s23 +; SI-NEXT: v_mov_b32_e32 v1, s20 +; SI-NEXT: s_and_b32 s20, s21, 0xffff +; SI-NEXT: s_lshl_b32 s21, s76, 16 +; SI-NEXT: s_or_b32 s20, s20, s21 +; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s20, s46, 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_or_b32 s18, s18, s20 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: s_and_b32 s9, s19, 0xffff -; SI-NEXT: s_lshl_b32 s11, s75, 16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_and_b32 s18, s19, 0xffff +; SI-NEXT: s_lshl_b32 s19, s75, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: s_and_b32 s9, s20, 0xffff -; SI-NEXT: s_lshl_b32 s11, s44, 16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s18, s44, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_or_b32 s16, s16, s18 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: s_and_b32 s9, s21, 0xffff -; SI-NEXT: s_lshl_b32 s11, s74, 16 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s17, 0xffff +; SI-NEXT: s_lshl_b32 s17, s74, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: s_and_b32 s9, s22, 0xffff -; SI-NEXT: s_lshl_b32 s11, s42, 16 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s16, s42, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_or_b32 s14, s14, s16 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: s_and_b32 s9, s23, 0xffff -; SI-NEXT: s_lshl_b32 s11, s73, 16 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_and_b32 s14, s15, 0xffff +; SI-NEXT: s_lshl_b32 s15, s73, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: s_and_b32 s9, s24, 0xffff -; SI-NEXT: s_lshl_b32 s11, s40, 16 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s14, s40, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_or_b32 s12, s12, s14 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: s_and_b32 s9, s25, 0xffff -; SI-NEXT: s_lshl_b32 s11, s72, 16 +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s12, s13, 0xffff +; SI-NEXT: s_lshl_b32 s13, s72, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_or_b32 s12, s12, s13 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s11, s14, 16 +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s12, s28, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_or_b32 s10, s10, s12 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: s_and_b32 s9, s27, 0xffff +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s10, s11, 0xffff ; SI-NEXT: s_lshl_b32 s11, s63, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 -; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: s_and_b32 s9, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s12, 16 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s10, s26, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_or_b32 s8, s8, s10 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: s_and_b32 s9, s29, 0xffff -; SI-NEXT: s_lshl_b32 s11, s62, 16 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s8, s9, 0xffff +; SI-NEXT: s_lshl_b32 s9, s62, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_lshl_b32 s9, s10, 16 +; SI-NEXT: s_lshl_b32 s8, s24, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 -; SI-NEXT: s_or_b32 s6, s6, s9 +; SI-NEXT: s_or_b32 s6, s6, s8 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -16411,7 +16632,7 @@ define inreg <36 x i16> @bitcast_v9i64_to_v36i16_scalar(<9 x i64> inreg %a, i32 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s6, s8, 16 +; SI-NEXT: s_lshl_b32 s6, s22, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -16438,20 +16659,48 @@ define inreg <36 x i16> @bitcast_v9i64_to_v36i16_scalar(<9 x i64> inreg %a, i32 ; SI-NEXT: ; implicit-def: $sgpr73 ; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr63 -; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr24 ; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr22 ; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: s_branch .LBB41_2 ; ; VI-LABEL: bitcast_v9i64_to_v36i16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v5, s16 +; VI-NEXT: v_mov_b32_e32 v6, s17 +; VI-NEXT: v_mov_b32_e32 v7, s18 +; VI-NEXT: v_mov_b32_e32 v8, s19 +; VI-NEXT: v_mov_b32_e32 v9, s20 +; VI-NEXT: v_mov_b32_e32 v10, s21 +; VI-NEXT: v_mov_b32_e32 v11, s22 +; VI-NEXT: v_mov_b32_e32 v12, s23 +; VI-NEXT: v_mov_b32_e32 v13, s24 +; VI-NEXT: v_mov_b32_e32 v14, s25 +; VI-NEXT: v_mov_b32_e32 v15, s26 +; VI-NEXT: v_mov_b32_e32 v16, s27 +; VI-NEXT: v_mov_b32_e32 v17, s28 +; VI-NEXT: v_mov_b32_e32 v18, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_readfirstlane_b32 s23, v5 +; VI-NEXT: v_readfirstlane_b32 s22, v6 +; VI-NEXT: v_readfirstlane_b32 s21, v7 +; VI-NEXT: v_readfirstlane_b32 s20, v8 +; VI-NEXT: v_readfirstlane_b32 s19, v9 +; VI-NEXT: v_readfirstlane_b32 s18, v10 +; VI-NEXT: v_readfirstlane_b32 s17, v11 +; VI-NEXT: v_readfirstlane_b32 s16, v12 +; VI-NEXT: v_readfirstlane_b32 s15, v13 +; VI-NEXT: v_readfirstlane_b32 s14, v14 +; VI-NEXT: v_readfirstlane_b32 s13, v15 +; VI-NEXT: v_readfirstlane_b32 s12, v16 +; VI-NEXT: v_readfirstlane_b32 s11, v17 +; VI-NEXT: v_readfirstlane_b32 s10, v18 ; VI-NEXT: v_readfirstlane_b32 s9, v0 ; VI-NEXT: v_readfirstlane_b32 s8, v1 ; VI-NEXT: v_readfirstlane_b32 s6, v2 @@ -16459,131 +16708,131 @@ define inreg <36 x i16> @bitcast_v9i64_to_v36i16_scalar(<9 x i64> inreg %a, i32 ; VI-NEXT: v_readfirstlane_b32 s7, v3 ; VI-NEXT: s_cbranch_scc0 .LBB41_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s10, s7, 16 -; VI-NEXT: s_lshr_b32 s11, s6, 16 -; VI-NEXT: s_lshr_b32 s12, s8, 16 -; VI-NEXT: s_lshr_b32 s13, s9, 16 -; VI-NEXT: s_lshr_b32 s14, s29, 16 -; VI-NEXT: s_lshr_b32 s15, s28, 16 -; VI-NEXT: s_lshr_b32 s40, s27, 16 -; VI-NEXT: s_lshr_b32 s41, s26, 16 -; VI-NEXT: s_lshr_b32 s42, s25, 16 -; VI-NEXT: s_lshr_b32 s43, s24, 16 -; VI-NEXT: s_lshr_b32 s44, s23, 16 -; VI-NEXT: s_lshr_b32 s45, s22, 16 -; VI-NEXT: s_lshr_b32 s46, s21, 16 -; VI-NEXT: s_lshr_b32 s47, s20, 16 -; VI-NEXT: s_lshr_b32 s56, s19, 16 -; VI-NEXT: s_lshr_b32 s57, s18, 16 -; VI-NEXT: s_lshr_b32 s58, s17, 16 -; VI-NEXT: s_lshr_b32 s59, s16, 16 +; VI-NEXT: s_lshr_b32 s24, s7, 16 +; VI-NEXT: s_lshr_b32 s25, s6, 16 +; VI-NEXT: s_lshr_b32 s26, s8, 16 +; VI-NEXT: s_lshr_b32 s27, s9, 16 +; VI-NEXT: s_lshr_b32 s28, s10, 16 +; VI-NEXT: s_lshr_b32 s29, s11, 16 +; VI-NEXT: s_lshr_b32 s40, s12, 16 +; VI-NEXT: s_lshr_b32 s41, s13, 16 +; VI-NEXT: s_lshr_b32 s42, s14, 16 +; VI-NEXT: s_lshr_b32 s43, s15, 16 +; VI-NEXT: s_lshr_b32 s44, s16, 16 +; VI-NEXT: s_lshr_b32 s45, s17, 16 +; VI-NEXT: s_lshr_b32 s46, s18, 16 +; VI-NEXT: s_lshr_b32 s47, s19, 16 +; VI-NEXT: s_lshr_b32 s56, s20, 16 +; VI-NEXT: s_lshr_b32 s57, s21, 16 +; VI-NEXT: s_lshr_b32 s58, s22, 16 +; VI-NEXT: s_lshr_b32 s59, s23, 16 ; VI-NEXT: s_cbranch_execnz .LBB41_3 ; VI-NEXT: .LBB41_2: ; %cmp.true ; VI-NEXT: s_add_u32 s6, s6, 3 ; VI-NEXT: s_addc_u32 s7, s7, 0 ; VI-NEXT: s_add_u32 s9, s9, 3 ; VI-NEXT: s_addc_u32 s8, s8, 0 -; VI-NEXT: s_add_u32 s28, s28, 3 -; VI-NEXT: s_addc_u32 s29, s29, 0 -; VI-NEXT: s_add_u32 s26, s26, 3 -; VI-NEXT: s_addc_u32 s27, s27, 0 -; VI-NEXT: s_add_u32 s24, s24, 3 -; VI-NEXT: s_addc_u32 s25, s25, 0 -; VI-NEXT: s_add_u32 s22, s22, 3 -; VI-NEXT: s_addc_u32 s23, s23, 0 -; VI-NEXT: s_add_u32 s20, s20, 3 -; VI-NEXT: s_addc_u32 s21, s21, 0 -; VI-NEXT: s_add_u32 s18, s18, 3 -; VI-NEXT: s_addc_u32 s19, s19, 0 -; VI-NEXT: s_add_u32 s16, s16, 3 -; VI-NEXT: s_addc_u32 s17, s17, 0 -; VI-NEXT: s_lshr_b32 s10, s7, 16 -; VI-NEXT: s_lshr_b32 s11, s6, 16 -; VI-NEXT: s_lshr_b32 s12, s8, 16 -; VI-NEXT: s_lshr_b32 s13, s9, 16 -; VI-NEXT: s_lshr_b32 s14, s29, 16 -; VI-NEXT: s_lshr_b32 s15, s28, 16 -; VI-NEXT: s_lshr_b32 s40, s27, 16 -; VI-NEXT: s_lshr_b32 s41, s26, 16 -; VI-NEXT: s_lshr_b32 s42, s25, 16 -; VI-NEXT: s_lshr_b32 s43, s24, 16 -; VI-NEXT: s_lshr_b32 s44, s23, 16 -; VI-NEXT: s_lshr_b32 s45, s22, 16 -; VI-NEXT: s_lshr_b32 s46, s21, 16 -; VI-NEXT: s_lshr_b32 s47, s20, 16 -; VI-NEXT: s_lshr_b32 s56, s19, 16 -; VI-NEXT: s_lshr_b32 s57, s18, 16 -; VI-NEXT: s_lshr_b32 s58, s17, 16 -; VI-NEXT: s_lshr_b32 s59, s16, 16 +; VI-NEXT: s_add_u32 s11, s11, 3 +; VI-NEXT: s_addc_u32 s10, s10, 0 +; VI-NEXT: s_add_u32 s13, s13, 3 +; VI-NEXT: s_addc_u32 s12, s12, 0 +; VI-NEXT: s_add_u32 s15, s15, 3 +; VI-NEXT: s_addc_u32 s14, s14, 0 +; VI-NEXT: s_add_u32 s17, s17, 3 +; VI-NEXT: s_addc_u32 s16, s16, 0 +; VI-NEXT: s_add_u32 s19, s19, 3 +; VI-NEXT: s_addc_u32 s18, s18, 0 +; VI-NEXT: s_add_u32 s21, s21, 3 +; VI-NEXT: s_addc_u32 s20, s20, 0 +; VI-NEXT: s_add_u32 s23, s23, 3 +; VI-NEXT: s_addc_u32 s22, s22, 0 +; VI-NEXT: s_lshr_b32 s24, s7, 16 +; VI-NEXT: s_lshr_b32 s25, s6, 16 +; VI-NEXT: s_lshr_b32 s26, s8, 16 +; VI-NEXT: s_lshr_b32 s27, s9, 16 +; VI-NEXT: s_lshr_b32 s28, s10, 16 +; VI-NEXT: s_lshr_b32 s29, s11, 16 +; VI-NEXT: s_lshr_b32 s40, s12, 16 +; VI-NEXT: s_lshr_b32 s41, s13, 16 +; VI-NEXT: s_lshr_b32 s42, s14, 16 +; VI-NEXT: s_lshr_b32 s43, s15, 16 +; VI-NEXT: s_lshr_b32 s44, s16, 16 +; VI-NEXT: s_lshr_b32 s45, s17, 16 +; VI-NEXT: s_lshr_b32 s46, s18, 16 +; VI-NEXT: s_lshr_b32 s47, s19, 16 +; VI-NEXT: s_lshr_b32 s56, s20, 16 +; VI-NEXT: s_lshr_b32 s57, s21, 16 +; VI-NEXT: s_lshr_b32 s58, s22, 16 +; VI-NEXT: s_lshr_b32 s59, s23, 16 ; VI-NEXT: .LBB41_3: ; %end -; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_and_b32 s4, 0xffff, s23 ; VI-NEXT: s_lshl_b32 s5, s59, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s16, s58, 16 -; VI-NEXT: s_or_b32 s5, s5, s16 -; VI-NEXT: s_and_b32 s16, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s17, s57, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_and_b32 s17, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s18, s56, 16 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s18, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s19, s47, 16 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_and_b32 s19, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s20, s46, 16 -; VI-NEXT: s_or_b32 s19, s19, s20 -; VI-NEXT: s_and_b32 s20, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s21, s45, 16 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_and_b32 s21, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s22, s44, 16 +; VI-NEXT: s_and_b32 s5, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s22, s58, 16 +; VI-NEXT: s_or_b32 s5, s5, s22 +; VI-NEXT: s_and_b32 s21, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s22, s57, 16 ; VI-NEXT: s_or_b32 s21, s21, s22 -; VI-NEXT: s_and_b32 s22, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s23, s43, 16 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_and_b32 s23, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s24, s42, 16 -; VI-NEXT: s_or_b32 s23, s23, s24 -; VI-NEXT: s_and_b32 s24, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s25, s41, 16 -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_and_b32 s25, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s26, s40, 16 -; VI-NEXT: s_or_b32 s25, s25, s26 -; VI-NEXT: s_and_b32 s26, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s15, s15, 16 -; VI-NEXT: s_or_b32 s15, s26, s15 -; VI-NEXT: s_and_b32 s26, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_and_b32 s20, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s22, s56, 16 +; VI-NEXT: s_or_b32 s20, s20, s22 +; VI-NEXT: s_and_b32 s19, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s22, s47, 16 +; VI-NEXT: s_or_b32 s19, s19, s22 +; VI-NEXT: s_and_b32 s18, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s22, s46, 16 +; VI-NEXT: s_or_b32 s18, s18, s22 +; VI-NEXT: s_and_b32 s17, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s22, s45, 16 +; VI-NEXT: s_or_b32 s17, s17, s22 +; VI-NEXT: s_and_b32 s16, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s22, s44, 16 +; VI-NEXT: s_or_b32 s16, s16, s22 +; VI-NEXT: s_and_b32 s15, 0xffff, s15 +; VI-NEXT: s_lshl_b32 s22, s43, 16 +; VI-NEXT: s_or_b32 s15, s15, s22 +; VI-NEXT: s_and_b32 s14, 0xffff, s14 +; VI-NEXT: s_lshl_b32 s22, s42, 16 +; VI-NEXT: s_or_b32 s14, s14, s22 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s22, s41, 16 +; VI-NEXT: s_or_b32 s13, s13, s22 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s22, s40, 16 +; VI-NEXT: s_or_b32 s12, s12, s22 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s22, s29, 16 +; VI-NEXT: s_or_b32 s11, s11, s22 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s22, s28, 16 +; VI-NEXT: s_or_b32 s10, s10, s22 ; VI-NEXT: s_and_b32 s9, 0xffff, s9 -; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_lshl_b32 s22, s27, 16 +; VI-NEXT: s_or_b32 s9, s9, s22 ; VI-NEXT: s_and_b32 s8, 0xffff, s8 -; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_lshl_b32 s22, s26, 16 +; VI-NEXT: s_or_b32 s8, s8, s22 ; VI-NEXT: s_and_b32 s6, 0xffff, s6 -; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_lshl_b32 s22, s25, 16 +; VI-NEXT: s_or_b32 s6, s6, s22 ; VI-NEXT: s_and_b32 s7, 0xffff, s7 -; VI-NEXT: s_lshl_b32 s10, s10, 16 -; VI-NEXT: s_or_b32 s14, s26, s14 -; VI-NEXT: s_or_b32 s9, s9, s13 -; VI-NEXT: s_or_b32 s8, s8, s12 -; VI-NEXT: s_or_b32 s6, s6, s11 -; VI-NEXT: s_or_b32 s7, s7, s10 +; VI-NEXT: s_lshl_b32 s22, s24, 16 +; VI-NEXT: s_or_b32 s7, s7, s22 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: v_mov_b32_e32 v3, s17 -; VI-NEXT: v_mov_b32_e32 v4, s18 -; VI-NEXT: v_mov_b32_e32 v5, s19 -; VI-NEXT: v_mov_b32_e32 v6, s20 -; VI-NEXT: v_mov_b32_e32 v7, s21 -; VI-NEXT: v_mov_b32_e32 v8, s22 -; VI-NEXT: v_mov_b32_e32 v9, s23 -; VI-NEXT: v_mov_b32_e32 v10, s24 -; VI-NEXT: v_mov_b32_e32 v11, s25 -; VI-NEXT: v_mov_b32_e32 v12, s15 -; VI-NEXT: v_mov_b32_e32 v13, s14 +; VI-NEXT: v_mov_b32_e32 v2, s21 +; VI-NEXT: v_mov_b32_e32 v3, s20 +; VI-NEXT: v_mov_b32_e32 v4, s19 +; VI-NEXT: v_mov_b32_e32 v5, s18 +; VI-NEXT: v_mov_b32_e32 v6, s17 +; VI-NEXT: v_mov_b32_e32 v7, s16 +; VI-NEXT: v_mov_b32_e32 v8, s15 +; VI-NEXT: v_mov_b32_e32 v9, s14 +; VI-NEXT: v_mov_b32_e32 v10, s13 +; VI-NEXT: v_mov_b32_e32 v11, s12 +; VI-NEXT: v_mov_b32_e32 v12, s11 +; VI-NEXT: v_mov_b32_e32 v13, s10 ; VI-NEXT: v_mov_b32_e32 v14, s9 ; VI-NEXT: v_mov_b32_e32 v15, s8 ; VI-NEXT: v_mov_b32_e32 v16, s6 @@ -16602,55 +16851,73 @@ define inreg <36 x i16> @bitcast_v9i64_to_v36i16_scalar(<9 x i64> inreg %a, i32 ; VI-NEXT: ; implicit-def: $sgpr42 ; VI-NEXT: ; implicit-def: $sgpr41 ; VI-NEXT: ; implicit-def: $sgpr40 -; VI-NEXT: ; implicit-def: $sgpr15 -; VI-NEXT: ; implicit-def: $sgpr14 -; VI-NEXT: ; implicit-def: $sgpr13 -; VI-NEXT: ; implicit-def: $sgpr12 -; VI-NEXT: ; implicit-def: $sgpr11 -; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr25 +; VI-NEXT: ; implicit-def: $sgpr24 ; VI-NEXT: s_branch .LBB41_2 ; ; GFX9-LABEL: bitcast_v9i64_to_v36i16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v5, s16 +; GFX9-NEXT: v_mov_b32_e32 v6, s17 +; GFX9-NEXT: v_mov_b32_e32 v7, s18 +; GFX9-NEXT: v_mov_b32_e32 v8, s19 +; GFX9-NEXT: v_mov_b32_e32 v9, s20 +; GFX9-NEXT: v_mov_b32_e32 v10, s21 +; GFX9-NEXT: v_mov_b32_e32 v11, s22 +; GFX9-NEXT: v_mov_b32_e32 v12, s23 +; GFX9-NEXT: v_mov_b32_e32 v13, s24 +; GFX9-NEXT: v_mov_b32_e32 v14, s25 +; GFX9-NEXT: v_mov_b32_e32 v15, s26 +; GFX9-NEXT: v_mov_b32_e32 v16, s27 +; GFX9-NEXT: v_mov_b32_e32 v17, s28 +; GFX9-NEXT: v_mov_b32_e32 v18, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: v_readfirstlane_b32 s7, v1 -; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s6, v5 +; GFX9-NEXT: v_readfirstlane_b32 s7, v6 +; GFX9-NEXT: v_readfirstlane_b32 s8, v7 +; GFX9-NEXT: v_readfirstlane_b32 s9, v8 +; GFX9-NEXT: v_readfirstlane_b32 s10, v9 +; GFX9-NEXT: v_readfirstlane_b32 s11, v10 +; GFX9-NEXT: v_readfirstlane_b32 s12, v11 +; GFX9-NEXT: v_readfirstlane_b32 s13, v12 +; GFX9-NEXT: v_readfirstlane_b32 s14, v13 +; GFX9-NEXT: v_readfirstlane_b32 s15, v14 +; GFX9-NEXT: v_readfirstlane_b32 s16, v15 +; GFX9-NEXT: v_readfirstlane_b32 s17, v16 +; GFX9-NEXT: v_readfirstlane_b32 s18, v17 +; GFX9-NEXT: v_readfirstlane_b32 s19, v18 +; GFX9-NEXT: v_readfirstlane_b32 s20, v0 +; GFX9-NEXT: v_readfirstlane_b32 s21, v1 +; GFX9-NEXT: v_readfirstlane_b32 s22, v2 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s23, v3 ; GFX9-NEXT: s_cbranch_scc0 .LBB41_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_lshr_b32 s10, s9, 16 -; GFX9-NEXT: s_lshr_b32 s11, s8, 16 -; GFX9-NEXT: s_lshr_b32 s12, s7, 16 -; GFX9-NEXT: s_lshr_b32 s13, s6, 16 -; GFX9-NEXT: s_lshr_b32 s14, s29, 16 -; GFX9-NEXT: s_lshr_b32 s15, s28, 16 -; GFX9-NEXT: s_lshr_b32 s40, s27, 16 -; GFX9-NEXT: s_lshr_b32 s41, s26, 16 -; GFX9-NEXT: s_lshr_b32 s42, s25, 16 -; GFX9-NEXT: s_lshr_b32 s43, s24, 16 -; GFX9-NEXT: s_lshr_b32 s44, s23, 16 -; GFX9-NEXT: s_lshr_b32 s45, s22, 16 -; GFX9-NEXT: s_lshr_b32 s46, s21, 16 -; GFX9-NEXT: s_lshr_b32 s47, s20, 16 -; GFX9-NEXT: s_lshr_b32 s56, s19, 16 -; GFX9-NEXT: s_lshr_b32 s57, s18, 16 -; GFX9-NEXT: s_lshr_b32 s58, s17, 16 -; GFX9-NEXT: s_lshr_b32 s59, s16, 16 +; GFX9-NEXT: s_lshr_b32 s24, s23, 16 +; GFX9-NEXT: s_lshr_b32 s25, s22, 16 +; GFX9-NEXT: s_lshr_b32 s26, s21, 16 +; GFX9-NEXT: s_lshr_b32 s27, s20, 16 +; GFX9-NEXT: s_lshr_b32 s28, s19, 16 +; GFX9-NEXT: s_lshr_b32 s29, s18, 16 +; GFX9-NEXT: s_lshr_b32 s40, s17, 16 +; GFX9-NEXT: s_lshr_b32 s41, s16, 16 +; GFX9-NEXT: s_lshr_b32 s42, s15, 16 +; GFX9-NEXT: s_lshr_b32 s43, s14, 16 +; GFX9-NEXT: s_lshr_b32 s44, s13, 16 +; GFX9-NEXT: s_lshr_b32 s45, s12, 16 +; GFX9-NEXT: s_lshr_b32 s46, s11, 16 +; GFX9-NEXT: s_lshr_b32 s47, s10, 16 +; GFX9-NEXT: s_lshr_b32 s56, s9, 16 +; GFX9-NEXT: s_lshr_b32 s57, s8, 16 +; GFX9-NEXT: s_lshr_b32 s58, s7, 16 +; GFX9-NEXT: s_lshr_b32 s59, s6, 16 ; GFX9-NEXT: s_cbranch_execnz .LBB41_3 ; GFX9-NEXT: .LBB41_2: ; %cmp.true -; GFX9-NEXT: s_add_u32 s8, s8, 3 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-NEXT: s_add_u32 s6, s6, 3 -; GFX9-NEXT: s_addc_u32 s7, s7, 0 -; GFX9-NEXT: s_add_u32 s28, s28, 3 -; GFX9-NEXT: s_addc_u32 s29, s29, 0 -; GFX9-NEXT: s_add_u32 s26, s26, 3 -; GFX9-NEXT: s_addc_u32 s27, s27, 0 -; GFX9-NEXT: s_add_u32 s24, s24, 3 -; GFX9-NEXT: s_addc_u32 s25, s25, 0 ; GFX9-NEXT: s_add_u32 s22, s22, 3 ; GFX9-NEXT: s_addc_u32 s23, s23, 0 ; GFX9-NEXT: s_add_u32 s20, s20, 3 @@ -16659,61 +16926,71 @@ define inreg <36 x i16> @bitcast_v9i64_to_v36i16_scalar(<9 x i64> inreg %a, i32 ; GFX9-NEXT: s_addc_u32 s19, s19, 0 ; GFX9-NEXT: s_add_u32 s16, s16, 3 ; GFX9-NEXT: s_addc_u32 s17, s17, 0 -; GFX9-NEXT: s_lshr_b32 s10, s9, 16 -; GFX9-NEXT: s_lshr_b32 s11, s8, 16 -; GFX9-NEXT: s_lshr_b32 s12, s7, 16 -; GFX9-NEXT: s_lshr_b32 s13, s6, 16 -; GFX9-NEXT: s_lshr_b32 s14, s29, 16 -; GFX9-NEXT: s_lshr_b32 s15, s28, 16 -; GFX9-NEXT: s_lshr_b32 s40, s27, 16 -; GFX9-NEXT: s_lshr_b32 s41, s26, 16 -; GFX9-NEXT: s_lshr_b32 s42, s25, 16 -; GFX9-NEXT: s_lshr_b32 s43, s24, 16 -; GFX9-NEXT: s_lshr_b32 s44, s23, 16 -; GFX9-NEXT: s_lshr_b32 s45, s22, 16 -; GFX9-NEXT: s_lshr_b32 s46, s21, 16 -; GFX9-NEXT: s_lshr_b32 s47, s20, 16 -; GFX9-NEXT: s_lshr_b32 s56, s19, 16 -; GFX9-NEXT: s_lshr_b32 s57, s18, 16 -; GFX9-NEXT: s_lshr_b32 s58, s17, 16 -; GFX9-NEXT: s_lshr_b32 s59, s16, 16 +; GFX9-NEXT: s_add_u32 s14, s14, 3 +; GFX9-NEXT: s_addc_u32 s15, s15, 0 +; GFX9-NEXT: s_add_u32 s12, s12, 3 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_add_u32 s10, s10, 3 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_lshr_b32 s24, s23, 16 +; GFX9-NEXT: s_lshr_b32 s25, s22, 16 +; GFX9-NEXT: s_lshr_b32 s26, s21, 16 +; GFX9-NEXT: s_lshr_b32 s27, s20, 16 +; GFX9-NEXT: s_lshr_b32 s28, s19, 16 +; GFX9-NEXT: s_lshr_b32 s29, s18, 16 +; GFX9-NEXT: s_lshr_b32 s40, s17, 16 +; GFX9-NEXT: s_lshr_b32 s41, s16, 16 +; GFX9-NEXT: s_lshr_b32 s42, s15, 16 +; GFX9-NEXT: s_lshr_b32 s43, s14, 16 +; GFX9-NEXT: s_lshr_b32 s44, s13, 16 +; GFX9-NEXT: s_lshr_b32 s45, s12, 16 +; GFX9-NEXT: s_lshr_b32 s46, s11, 16 +; GFX9-NEXT: s_lshr_b32 s47, s10, 16 +; GFX9-NEXT: s_lshr_b32 s56, s9, 16 +; GFX9-NEXT: s_lshr_b32 s57, s8, 16 +; GFX9-NEXT: s_lshr_b32 s58, s7, 16 +; GFX9-NEXT: s_lshr_b32 s59, s6, 16 ; GFX9-NEXT: .LBB41_3: ; %end -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s59 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s58 -; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s57 -; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s56 -; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s47 -; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s46 -; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s45 -; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s44 -; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s43 -; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s42 -; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s41 -; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s40 -; GFX9-NEXT: s_pack_ll_b32_b16 s15, s28, s15 -; GFX9-NEXT: s_pack_ll_b32_b16 s14, s29, s14 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s13 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s12 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s11 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s7, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s9, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s10, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s11, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s12, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s13, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s14, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s15, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s16, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s17, s40 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s29 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s28 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s27 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s26 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s25 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s24 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s18 -; GFX9-NEXT: v_mov_b32_e32 v5, s19 -; GFX9-NEXT: v_mov_b32_e32 v6, s20 -; GFX9-NEXT: v_mov_b32_e32 v7, s21 -; GFX9-NEXT: v_mov_b32_e32 v8, s22 -; GFX9-NEXT: v_mov_b32_e32 v9, s23 -; GFX9-NEXT: v_mov_b32_e32 v10, s24 -; GFX9-NEXT: v_mov_b32_e32 v11, s25 -; GFX9-NEXT: v_mov_b32_e32 v12, s15 -; GFX9-NEXT: v_mov_b32_e32 v13, s14 -; GFX9-NEXT: v_mov_b32_e32 v14, s6 -; GFX9-NEXT: v_mov_b32_e32 v15, s7 -; GFX9-NEXT: v_mov_b32_e32 v16, s8 -; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-NEXT: v_mov_b32_e32 v12, s16 +; GFX9-NEXT: v_mov_b32_e32 v13, s17 +; GFX9-NEXT: v_mov_b32_e32 v14, s18 +; GFX9-NEXT: v_mov_b32_e32 v15, s19 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB41_4: ; GFX9-NEXT: ; implicit-def: $sgpr59 @@ -16728,12 +17005,12 @@ define inreg <36 x i16> @bitcast_v9i64_to_v36i16_scalar(<9 x i64> inreg %a, i32 ; GFX9-NEXT: ; implicit-def: $sgpr42 ; GFX9-NEXT: ; implicit-def: $sgpr41 ; GFX9-NEXT: ; implicit-def: $sgpr40 -; GFX9-NEXT: ; implicit-def: $sgpr15 -; GFX9-NEXT: ; implicit-def: $sgpr14 -; GFX9-NEXT: ; implicit-def: $sgpr13 -; GFX9-NEXT: ; implicit-def: $sgpr12 -; GFX9-NEXT: ; implicit-def: $sgpr11 -; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr29 +; GFX9-NEXT: ; implicit-def: $sgpr28 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr25 +; GFX9-NEXT: ; implicit-def: $sgpr24 ; GFX9-NEXT: s_branch .LBB41_2 ; ; GFX11-LABEL: bitcast_v9i64_to_v36i16_scalar: @@ -18926,7 +19203,35 @@ define inreg <36 x half> @bitcast_v9i64_to_v36f16_scalar(<9 x i64> inreg %a, i32 ; SI-LABEL: bitcast_v9i64_to_v36f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, s16 +; SI-NEXT: v_mov_b32_e32 v7, s17 +; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v9, s19 +; SI-NEXT: v_mov_b32_e32 v10, s20 +; SI-NEXT: v_mov_b32_e32 v11, s21 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v13, s23 +; SI-NEXT: v_mov_b32_e32 v14, s24 +; SI-NEXT: v_mov_b32_e32 v15, s25 +; SI-NEXT: v_mov_b32_e32 v16, s26 +; SI-NEXT: v_mov_b32_e32 v17, s27 +; SI-NEXT: v_mov_b32_e32 v18, s28 +; SI-NEXT: v_mov_b32_e32 v19, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; SI-NEXT: v_readfirstlane_b32 s22, v6 +; SI-NEXT: v_readfirstlane_b32 s23, v7 +; SI-NEXT: v_readfirstlane_b32 s20, v8 +; SI-NEXT: v_readfirstlane_b32 s21, v9 +; SI-NEXT: v_readfirstlane_b32 s18, v10 +; SI-NEXT: v_readfirstlane_b32 s19, v11 +; SI-NEXT: v_readfirstlane_b32 s16, v12 +; SI-NEXT: v_readfirstlane_b32 s17, v13 +; SI-NEXT: v_readfirstlane_b32 s14, v14 +; SI-NEXT: v_readfirstlane_b32 s15, v15 +; SI-NEXT: v_readfirstlane_b32 s12, v16 +; SI-NEXT: v_readfirstlane_b32 s13, v17 +; SI-NEXT: v_readfirstlane_b32 s10, v18 +; SI-NEXT: v_readfirstlane_b32 s11, v19 ; SI-NEXT: v_readfirstlane_b32 s7, v1 ; SI-NEXT: v_readfirstlane_b32 s8, v2 ; SI-NEXT: v_readfirstlane_b32 s6, v3 @@ -18942,83 +19247,83 @@ define inreg <36 x half> @bitcast_v9i64_to_v36f16_scalar(<9 x i64> inreg %a, i32 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s7, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s4, s11, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: s_lshr_b32 s4, s10, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: s_lshr_b32 s4, s13, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: s_lshr_b32 s4, s12, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: s_lshr_b32 s4, s15, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: s_lshr_b32 s4, s14, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: s_lshr_b32 s4, s17, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: s_lshr_b32 s4, s19, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: s_lshr_b32 s4, s18, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: s_lshr_b32 s4, s21, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: s_lshr_b32 s4, s20, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: s_lshr_b32 s4, s23, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_lshr_b32 s4, s22, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s22 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s16, 3 -; SI-NEXT: s_addc_u32 s5, s17, 0 -; SI-NEXT: s_lshr_b32 s10, s4, 16 -; SI-NEXT: s_lshr_b32 s11, s5, 16 -; SI-NEXT: s_add_u32 s12, s18, 3 -; SI-NEXT: s_addc_u32 s13, s19, 0 -; SI-NEXT: s_lshr_b32 s14, s12, 16 -; SI-NEXT: s_lshr_b32 s15, s13, 16 -; SI-NEXT: s_add_u32 s16, s20, 3 -; SI-NEXT: s_addc_u32 s17, s21, 0 -; SI-NEXT: s_lshr_b32 s18, s16, 16 -; SI-NEXT: s_lshr_b32 s19, s17, 16 -; SI-NEXT: s_add_u32 s20, s22, 3 -; SI-NEXT: s_addc_u32 s21, s23, 0 -; SI-NEXT: s_lshr_b32 s22, s20, 16 -; SI-NEXT: s_lshr_b32 s23, s21, 16 -; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s25, s25, 0 -; SI-NEXT: s_lshr_b32 s40, s24, 16 -; SI-NEXT: s_lshr_b32 s41, s25, 16 -; SI-NEXT: s_add_u32 s26, s26, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 -; SI-NEXT: s_lshr_b32 s42, s26, 16 -; SI-NEXT: s_lshr_b32 s43, s27, 16 -; SI-NEXT: s_add_u32 s28, s28, 3 -; SI-NEXT: s_addc_u32 s29, s29, 0 -; SI-NEXT: s_lshr_b32 s44, s28, 16 -; SI-NEXT: s_lshr_b32 s45, s29, 16 -; SI-NEXT: s_add_u32 s7, s7, 3 +; SI-NEXT: s_add_u32 s4, s22, 3 +; SI-NEXT: s_addc_u32 s5, s23, 0 +; SI-NEXT: s_lshr_b32 s22, s4, 16 +; SI-NEXT: s_lshr_b32 s23, s5, 16 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_lshr_b32 s24, s20, 16 +; SI-NEXT: s_lshr_b32 s25, s21, 16 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_lshr_b32 s26, s18, 16 +; SI-NEXT: s_lshr_b32 s27, s19, 16 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_lshr_b32 s28, s16, 16 +; SI-NEXT: s_lshr_b32 s29, s17, 16 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 +; SI-NEXT: s_lshr_b32 s40, s14, 16 +; SI-NEXT: s_lshr_b32 s41, s15, 16 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_lshr_b32 s42, s12, 16 +; SI-NEXT: s_lshr_b32 s43, s13, 16 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_lshr_b32 s44, s10, 16 +; SI-NEXT: s_lshr_b32 s45, s11, 16 +; SI-NEXT: s_add_u32 s7, s7, 3 ; SI-NEXT: s_addc_u32 s8, s8, 0 ; SI-NEXT: s_lshr_b32 s46, s7, 16 ; SI-NEXT: s_lshr_b32 s47, s8, 16 @@ -19030,18 +19335,18 @@ define inreg <36 x half> @bitcast_v9i64_to_v36f16_scalar(<9 x i64> inreg %a, i32 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s20 ; SI-NEXT: v_cvt_f32_f16_e32 v33, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s57 @@ -19054,14 +19359,14 @@ define inreg <36 x half> @bitcast_v9i64_to_v36f16_scalar(<9 x i64> inreg %a, i32 ; SI-NEXT: v_cvt_f32_f16_e32 v15, s42 ; SI-NEXT: v_cvt_f32_f16_e32 v17, s41 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s22 ; SI-NEXT: .LBB45_3: ; %end ; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 @@ -19231,7 +19536,35 @@ define inreg <36 x half> @bitcast_v9i64_to_v36f16_scalar(<9 x i64> inreg %a, i32 ; VI-LABEL: bitcast_v9i64_to_v36f16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v5, s16 +; VI-NEXT: v_mov_b32_e32 v6, s17 +; VI-NEXT: v_mov_b32_e32 v7, s18 +; VI-NEXT: v_mov_b32_e32 v8, s19 +; VI-NEXT: v_mov_b32_e32 v9, s20 +; VI-NEXT: v_mov_b32_e32 v10, s21 +; VI-NEXT: v_mov_b32_e32 v11, s22 +; VI-NEXT: v_mov_b32_e32 v12, s23 +; VI-NEXT: v_mov_b32_e32 v13, s24 +; VI-NEXT: v_mov_b32_e32 v14, s25 +; VI-NEXT: v_mov_b32_e32 v15, s26 +; VI-NEXT: v_mov_b32_e32 v16, s27 +; VI-NEXT: v_mov_b32_e32 v17, s28 +; VI-NEXT: v_mov_b32_e32 v18, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_readfirstlane_b32 s23, v5 +; VI-NEXT: v_readfirstlane_b32 s22, v6 +; VI-NEXT: v_readfirstlane_b32 s21, v7 +; VI-NEXT: v_readfirstlane_b32 s20, v8 +; VI-NEXT: v_readfirstlane_b32 s19, v9 +; VI-NEXT: v_readfirstlane_b32 s18, v10 +; VI-NEXT: v_readfirstlane_b32 s17, v11 +; VI-NEXT: v_readfirstlane_b32 s16, v12 +; VI-NEXT: v_readfirstlane_b32 s15, v13 +; VI-NEXT: v_readfirstlane_b32 s14, v14 +; VI-NEXT: v_readfirstlane_b32 s13, v15 +; VI-NEXT: v_readfirstlane_b32 s12, v16 +; VI-NEXT: v_readfirstlane_b32 s11, v17 +; VI-NEXT: v_readfirstlane_b32 s10, v18 ; VI-NEXT: v_readfirstlane_b32 s9, v0 ; VI-NEXT: v_readfirstlane_b32 s8, v1 ; VI-NEXT: v_readfirstlane_b32 s6, v2 @@ -19239,131 +19572,131 @@ define inreg <36 x half> @bitcast_v9i64_to_v36f16_scalar(<9 x i64> inreg %a, i32 ; VI-NEXT: v_readfirstlane_b32 s7, v3 ; VI-NEXT: s_cbranch_scc0 .LBB45_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s10, s7, 16 -; VI-NEXT: s_lshr_b32 s11, s6, 16 -; VI-NEXT: s_lshr_b32 s12, s8, 16 -; VI-NEXT: s_lshr_b32 s13, s9, 16 -; VI-NEXT: s_lshr_b32 s14, s29, 16 -; VI-NEXT: s_lshr_b32 s15, s28, 16 -; VI-NEXT: s_lshr_b32 s40, s27, 16 -; VI-NEXT: s_lshr_b32 s41, s26, 16 -; VI-NEXT: s_lshr_b32 s42, s25, 16 -; VI-NEXT: s_lshr_b32 s43, s24, 16 -; VI-NEXT: s_lshr_b32 s44, s23, 16 -; VI-NEXT: s_lshr_b32 s45, s22, 16 -; VI-NEXT: s_lshr_b32 s46, s21, 16 -; VI-NEXT: s_lshr_b32 s47, s20, 16 -; VI-NEXT: s_lshr_b32 s56, s19, 16 -; VI-NEXT: s_lshr_b32 s57, s18, 16 -; VI-NEXT: s_lshr_b32 s58, s17, 16 -; VI-NEXT: s_lshr_b32 s59, s16, 16 +; VI-NEXT: s_lshr_b32 s24, s7, 16 +; VI-NEXT: s_lshr_b32 s25, s6, 16 +; VI-NEXT: s_lshr_b32 s26, s8, 16 +; VI-NEXT: s_lshr_b32 s27, s9, 16 +; VI-NEXT: s_lshr_b32 s28, s10, 16 +; VI-NEXT: s_lshr_b32 s29, s11, 16 +; VI-NEXT: s_lshr_b32 s40, s12, 16 +; VI-NEXT: s_lshr_b32 s41, s13, 16 +; VI-NEXT: s_lshr_b32 s42, s14, 16 +; VI-NEXT: s_lshr_b32 s43, s15, 16 +; VI-NEXT: s_lshr_b32 s44, s16, 16 +; VI-NEXT: s_lshr_b32 s45, s17, 16 +; VI-NEXT: s_lshr_b32 s46, s18, 16 +; VI-NEXT: s_lshr_b32 s47, s19, 16 +; VI-NEXT: s_lshr_b32 s56, s20, 16 +; VI-NEXT: s_lshr_b32 s57, s21, 16 +; VI-NEXT: s_lshr_b32 s58, s22, 16 +; VI-NEXT: s_lshr_b32 s59, s23, 16 ; VI-NEXT: s_cbranch_execnz .LBB45_3 ; VI-NEXT: .LBB45_2: ; %cmp.true ; VI-NEXT: s_add_u32 s6, s6, 3 ; VI-NEXT: s_addc_u32 s7, s7, 0 ; VI-NEXT: s_add_u32 s9, s9, 3 ; VI-NEXT: s_addc_u32 s8, s8, 0 -; VI-NEXT: s_add_u32 s28, s28, 3 -; VI-NEXT: s_addc_u32 s29, s29, 0 -; VI-NEXT: s_add_u32 s26, s26, 3 -; VI-NEXT: s_addc_u32 s27, s27, 0 -; VI-NEXT: s_add_u32 s24, s24, 3 -; VI-NEXT: s_addc_u32 s25, s25, 0 -; VI-NEXT: s_add_u32 s22, s22, 3 -; VI-NEXT: s_addc_u32 s23, s23, 0 -; VI-NEXT: s_add_u32 s20, s20, 3 -; VI-NEXT: s_addc_u32 s21, s21, 0 -; VI-NEXT: s_add_u32 s18, s18, 3 -; VI-NEXT: s_addc_u32 s19, s19, 0 -; VI-NEXT: s_add_u32 s16, s16, 3 -; VI-NEXT: s_addc_u32 s17, s17, 0 -; VI-NEXT: s_lshr_b32 s10, s7, 16 -; VI-NEXT: s_lshr_b32 s11, s6, 16 -; VI-NEXT: s_lshr_b32 s12, s8, 16 -; VI-NEXT: s_lshr_b32 s13, s9, 16 -; VI-NEXT: s_lshr_b32 s14, s29, 16 -; VI-NEXT: s_lshr_b32 s15, s28, 16 -; VI-NEXT: s_lshr_b32 s40, s27, 16 -; VI-NEXT: s_lshr_b32 s41, s26, 16 -; VI-NEXT: s_lshr_b32 s42, s25, 16 -; VI-NEXT: s_lshr_b32 s43, s24, 16 -; VI-NEXT: s_lshr_b32 s44, s23, 16 -; VI-NEXT: s_lshr_b32 s45, s22, 16 -; VI-NEXT: s_lshr_b32 s46, s21, 16 -; VI-NEXT: s_lshr_b32 s47, s20, 16 -; VI-NEXT: s_lshr_b32 s56, s19, 16 -; VI-NEXT: s_lshr_b32 s57, s18, 16 -; VI-NEXT: s_lshr_b32 s58, s17, 16 -; VI-NEXT: s_lshr_b32 s59, s16, 16 +; VI-NEXT: s_add_u32 s11, s11, 3 +; VI-NEXT: s_addc_u32 s10, s10, 0 +; VI-NEXT: s_add_u32 s13, s13, 3 +; VI-NEXT: s_addc_u32 s12, s12, 0 +; VI-NEXT: s_add_u32 s15, s15, 3 +; VI-NEXT: s_addc_u32 s14, s14, 0 +; VI-NEXT: s_add_u32 s17, s17, 3 +; VI-NEXT: s_addc_u32 s16, s16, 0 +; VI-NEXT: s_add_u32 s19, s19, 3 +; VI-NEXT: s_addc_u32 s18, s18, 0 +; VI-NEXT: s_add_u32 s21, s21, 3 +; VI-NEXT: s_addc_u32 s20, s20, 0 +; VI-NEXT: s_add_u32 s23, s23, 3 +; VI-NEXT: s_addc_u32 s22, s22, 0 +; VI-NEXT: s_lshr_b32 s24, s7, 16 +; VI-NEXT: s_lshr_b32 s25, s6, 16 +; VI-NEXT: s_lshr_b32 s26, s8, 16 +; VI-NEXT: s_lshr_b32 s27, s9, 16 +; VI-NEXT: s_lshr_b32 s28, s10, 16 +; VI-NEXT: s_lshr_b32 s29, s11, 16 +; VI-NEXT: s_lshr_b32 s40, s12, 16 +; VI-NEXT: s_lshr_b32 s41, s13, 16 +; VI-NEXT: s_lshr_b32 s42, s14, 16 +; VI-NEXT: s_lshr_b32 s43, s15, 16 +; VI-NEXT: s_lshr_b32 s44, s16, 16 +; VI-NEXT: s_lshr_b32 s45, s17, 16 +; VI-NEXT: s_lshr_b32 s46, s18, 16 +; VI-NEXT: s_lshr_b32 s47, s19, 16 +; VI-NEXT: s_lshr_b32 s56, s20, 16 +; VI-NEXT: s_lshr_b32 s57, s21, 16 +; VI-NEXT: s_lshr_b32 s58, s22, 16 +; VI-NEXT: s_lshr_b32 s59, s23, 16 ; VI-NEXT: .LBB45_3: ; %end -; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_and_b32 s4, 0xffff, s23 ; VI-NEXT: s_lshl_b32 s5, s59, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s16, s58, 16 -; VI-NEXT: s_or_b32 s5, s5, s16 -; VI-NEXT: s_and_b32 s16, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s17, s57, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_and_b32 s17, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s18, s56, 16 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s18, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s19, s47, 16 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_and_b32 s19, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s20, s46, 16 -; VI-NEXT: s_or_b32 s19, s19, s20 -; VI-NEXT: s_and_b32 s20, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s21, s45, 16 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_and_b32 s21, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s22, s44, 16 +; VI-NEXT: s_and_b32 s5, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s22, s58, 16 +; VI-NEXT: s_or_b32 s5, s5, s22 +; VI-NEXT: s_and_b32 s21, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s22, s57, 16 ; VI-NEXT: s_or_b32 s21, s21, s22 -; VI-NEXT: s_and_b32 s22, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s23, s43, 16 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_and_b32 s23, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s24, s42, 16 -; VI-NEXT: s_or_b32 s23, s23, s24 -; VI-NEXT: s_and_b32 s24, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s25, s41, 16 -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_and_b32 s25, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s26, s40, 16 -; VI-NEXT: s_or_b32 s25, s25, s26 -; VI-NEXT: s_and_b32 s26, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s15, s15, 16 -; VI-NEXT: s_or_b32 s15, s26, s15 -; VI-NEXT: s_and_b32 s26, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_and_b32 s20, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s22, s56, 16 +; VI-NEXT: s_or_b32 s20, s20, s22 +; VI-NEXT: s_and_b32 s19, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s22, s47, 16 +; VI-NEXT: s_or_b32 s19, s19, s22 +; VI-NEXT: s_and_b32 s18, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s22, s46, 16 +; VI-NEXT: s_or_b32 s18, s18, s22 +; VI-NEXT: s_and_b32 s17, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s22, s45, 16 +; VI-NEXT: s_or_b32 s17, s17, s22 +; VI-NEXT: s_and_b32 s16, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s22, s44, 16 +; VI-NEXT: s_or_b32 s16, s16, s22 +; VI-NEXT: s_and_b32 s15, 0xffff, s15 +; VI-NEXT: s_lshl_b32 s22, s43, 16 +; VI-NEXT: s_or_b32 s15, s15, s22 +; VI-NEXT: s_and_b32 s14, 0xffff, s14 +; VI-NEXT: s_lshl_b32 s22, s42, 16 +; VI-NEXT: s_or_b32 s14, s14, s22 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s22, s41, 16 +; VI-NEXT: s_or_b32 s13, s13, s22 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s22, s40, 16 +; VI-NEXT: s_or_b32 s12, s12, s22 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s22, s29, 16 +; VI-NEXT: s_or_b32 s11, s11, s22 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s22, s28, 16 +; VI-NEXT: s_or_b32 s10, s10, s22 ; VI-NEXT: s_and_b32 s9, 0xffff, s9 -; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_lshl_b32 s22, s27, 16 +; VI-NEXT: s_or_b32 s9, s9, s22 ; VI-NEXT: s_and_b32 s8, 0xffff, s8 -; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_lshl_b32 s22, s26, 16 +; VI-NEXT: s_or_b32 s8, s8, s22 ; VI-NEXT: s_and_b32 s6, 0xffff, s6 -; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_lshl_b32 s22, s25, 16 +; VI-NEXT: s_or_b32 s6, s6, s22 ; VI-NEXT: s_and_b32 s7, 0xffff, s7 -; VI-NEXT: s_lshl_b32 s10, s10, 16 -; VI-NEXT: s_or_b32 s14, s26, s14 -; VI-NEXT: s_or_b32 s9, s9, s13 -; VI-NEXT: s_or_b32 s8, s8, s12 -; VI-NEXT: s_or_b32 s6, s6, s11 -; VI-NEXT: s_or_b32 s7, s7, s10 +; VI-NEXT: s_lshl_b32 s22, s24, 16 +; VI-NEXT: s_or_b32 s7, s7, s22 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: v_mov_b32_e32 v3, s17 -; VI-NEXT: v_mov_b32_e32 v4, s18 -; VI-NEXT: v_mov_b32_e32 v5, s19 -; VI-NEXT: v_mov_b32_e32 v6, s20 -; VI-NEXT: v_mov_b32_e32 v7, s21 -; VI-NEXT: v_mov_b32_e32 v8, s22 -; VI-NEXT: v_mov_b32_e32 v9, s23 -; VI-NEXT: v_mov_b32_e32 v10, s24 -; VI-NEXT: v_mov_b32_e32 v11, s25 -; VI-NEXT: v_mov_b32_e32 v12, s15 -; VI-NEXT: v_mov_b32_e32 v13, s14 +; VI-NEXT: v_mov_b32_e32 v2, s21 +; VI-NEXT: v_mov_b32_e32 v3, s20 +; VI-NEXT: v_mov_b32_e32 v4, s19 +; VI-NEXT: v_mov_b32_e32 v5, s18 +; VI-NEXT: v_mov_b32_e32 v6, s17 +; VI-NEXT: v_mov_b32_e32 v7, s16 +; VI-NEXT: v_mov_b32_e32 v8, s15 +; VI-NEXT: v_mov_b32_e32 v9, s14 +; VI-NEXT: v_mov_b32_e32 v10, s13 +; VI-NEXT: v_mov_b32_e32 v11, s12 +; VI-NEXT: v_mov_b32_e32 v12, s11 +; VI-NEXT: v_mov_b32_e32 v13, s10 ; VI-NEXT: v_mov_b32_e32 v14, s9 ; VI-NEXT: v_mov_b32_e32 v15, s8 ; VI-NEXT: v_mov_b32_e32 v16, s6 @@ -19382,55 +19715,73 @@ define inreg <36 x half> @bitcast_v9i64_to_v36f16_scalar(<9 x i64> inreg %a, i32 ; VI-NEXT: ; implicit-def: $sgpr42 ; VI-NEXT: ; implicit-def: $sgpr41 ; VI-NEXT: ; implicit-def: $sgpr40 -; VI-NEXT: ; implicit-def: $sgpr15 -; VI-NEXT: ; implicit-def: $sgpr14 -; VI-NEXT: ; implicit-def: $sgpr13 -; VI-NEXT: ; implicit-def: $sgpr12 -; VI-NEXT: ; implicit-def: $sgpr11 -; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr25 +; VI-NEXT: ; implicit-def: $sgpr24 ; VI-NEXT: s_branch .LBB45_2 ; ; GFX9-LABEL: bitcast_v9i64_to_v36f16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v5, s16 +; GFX9-NEXT: v_mov_b32_e32 v6, s17 +; GFX9-NEXT: v_mov_b32_e32 v7, s18 +; GFX9-NEXT: v_mov_b32_e32 v8, s19 +; GFX9-NEXT: v_mov_b32_e32 v9, s20 +; GFX9-NEXT: v_mov_b32_e32 v10, s21 +; GFX9-NEXT: v_mov_b32_e32 v11, s22 +; GFX9-NEXT: v_mov_b32_e32 v12, s23 +; GFX9-NEXT: v_mov_b32_e32 v13, s24 +; GFX9-NEXT: v_mov_b32_e32 v14, s25 +; GFX9-NEXT: v_mov_b32_e32 v15, s26 +; GFX9-NEXT: v_mov_b32_e32 v16, s27 +; GFX9-NEXT: v_mov_b32_e32 v17, s28 +; GFX9-NEXT: v_mov_b32_e32 v18, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: v_readfirstlane_b32 s7, v1 -; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s6, v5 +; GFX9-NEXT: v_readfirstlane_b32 s7, v6 +; GFX9-NEXT: v_readfirstlane_b32 s8, v7 +; GFX9-NEXT: v_readfirstlane_b32 s9, v8 +; GFX9-NEXT: v_readfirstlane_b32 s10, v9 +; GFX9-NEXT: v_readfirstlane_b32 s11, v10 +; GFX9-NEXT: v_readfirstlane_b32 s12, v11 +; GFX9-NEXT: v_readfirstlane_b32 s13, v12 +; GFX9-NEXT: v_readfirstlane_b32 s14, v13 +; GFX9-NEXT: v_readfirstlane_b32 s15, v14 +; GFX9-NEXT: v_readfirstlane_b32 s16, v15 +; GFX9-NEXT: v_readfirstlane_b32 s17, v16 +; GFX9-NEXT: v_readfirstlane_b32 s18, v17 +; GFX9-NEXT: v_readfirstlane_b32 s19, v18 +; GFX9-NEXT: v_readfirstlane_b32 s20, v0 +; GFX9-NEXT: v_readfirstlane_b32 s21, v1 +; GFX9-NEXT: v_readfirstlane_b32 s22, v2 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s23, v3 ; GFX9-NEXT: s_cbranch_scc0 .LBB45_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_lshr_b32 s10, s9, 16 -; GFX9-NEXT: s_lshr_b32 s11, s8, 16 -; GFX9-NEXT: s_lshr_b32 s12, s7, 16 -; GFX9-NEXT: s_lshr_b32 s13, s6, 16 -; GFX9-NEXT: s_lshr_b32 s14, s29, 16 -; GFX9-NEXT: s_lshr_b32 s15, s28, 16 -; GFX9-NEXT: s_lshr_b32 s40, s27, 16 -; GFX9-NEXT: s_lshr_b32 s41, s26, 16 -; GFX9-NEXT: s_lshr_b32 s42, s25, 16 -; GFX9-NEXT: s_lshr_b32 s43, s24, 16 -; GFX9-NEXT: s_lshr_b32 s44, s23, 16 -; GFX9-NEXT: s_lshr_b32 s45, s22, 16 -; GFX9-NEXT: s_lshr_b32 s46, s21, 16 -; GFX9-NEXT: s_lshr_b32 s47, s20, 16 -; GFX9-NEXT: s_lshr_b32 s56, s19, 16 -; GFX9-NEXT: s_lshr_b32 s57, s18, 16 -; GFX9-NEXT: s_lshr_b32 s58, s17, 16 -; GFX9-NEXT: s_lshr_b32 s59, s16, 16 +; GFX9-NEXT: s_lshr_b32 s24, s23, 16 +; GFX9-NEXT: s_lshr_b32 s25, s22, 16 +; GFX9-NEXT: s_lshr_b32 s26, s21, 16 +; GFX9-NEXT: s_lshr_b32 s27, s20, 16 +; GFX9-NEXT: s_lshr_b32 s28, s19, 16 +; GFX9-NEXT: s_lshr_b32 s29, s18, 16 +; GFX9-NEXT: s_lshr_b32 s40, s17, 16 +; GFX9-NEXT: s_lshr_b32 s41, s16, 16 +; GFX9-NEXT: s_lshr_b32 s42, s15, 16 +; GFX9-NEXT: s_lshr_b32 s43, s14, 16 +; GFX9-NEXT: s_lshr_b32 s44, s13, 16 +; GFX9-NEXT: s_lshr_b32 s45, s12, 16 +; GFX9-NEXT: s_lshr_b32 s46, s11, 16 +; GFX9-NEXT: s_lshr_b32 s47, s10, 16 +; GFX9-NEXT: s_lshr_b32 s56, s9, 16 +; GFX9-NEXT: s_lshr_b32 s57, s8, 16 +; GFX9-NEXT: s_lshr_b32 s58, s7, 16 +; GFX9-NEXT: s_lshr_b32 s59, s6, 16 ; GFX9-NEXT: s_cbranch_execnz .LBB45_3 ; GFX9-NEXT: .LBB45_2: ; %cmp.true -; GFX9-NEXT: s_add_u32 s8, s8, 3 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-NEXT: s_add_u32 s6, s6, 3 -; GFX9-NEXT: s_addc_u32 s7, s7, 0 -; GFX9-NEXT: s_add_u32 s28, s28, 3 -; GFX9-NEXT: s_addc_u32 s29, s29, 0 -; GFX9-NEXT: s_add_u32 s26, s26, 3 -; GFX9-NEXT: s_addc_u32 s27, s27, 0 -; GFX9-NEXT: s_add_u32 s24, s24, 3 -; GFX9-NEXT: s_addc_u32 s25, s25, 0 ; GFX9-NEXT: s_add_u32 s22, s22, 3 ; GFX9-NEXT: s_addc_u32 s23, s23, 0 ; GFX9-NEXT: s_add_u32 s20, s20, 3 @@ -19439,61 +19790,71 @@ define inreg <36 x half> @bitcast_v9i64_to_v36f16_scalar(<9 x i64> inreg %a, i32 ; GFX9-NEXT: s_addc_u32 s19, s19, 0 ; GFX9-NEXT: s_add_u32 s16, s16, 3 ; GFX9-NEXT: s_addc_u32 s17, s17, 0 -; GFX9-NEXT: s_lshr_b32 s10, s9, 16 -; GFX9-NEXT: s_lshr_b32 s11, s8, 16 -; GFX9-NEXT: s_lshr_b32 s12, s7, 16 -; GFX9-NEXT: s_lshr_b32 s13, s6, 16 -; GFX9-NEXT: s_lshr_b32 s14, s29, 16 -; GFX9-NEXT: s_lshr_b32 s15, s28, 16 -; GFX9-NEXT: s_lshr_b32 s40, s27, 16 -; GFX9-NEXT: s_lshr_b32 s41, s26, 16 -; GFX9-NEXT: s_lshr_b32 s42, s25, 16 -; GFX9-NEXT: s_lshr_b32 s43, s24, 16 -; GFX9-NEXT: s_lshr_b32 s44, s23, 16 -; GFX9-NEXT: s_lshr_b32 s45, s22, 16 -; GFX9-NEXT: s_lshr_b32 s46, s21, 16 -; GFX9-NEXT: s_lshr_b32 s47, s20, 16 -; GFX9-NEXT: s_lshr_b32 s56, s19, 16 -; GFX9-NEXT: s_lshr_b32 s57, s18, 16 -; GFX9-NEXT: s_lshr_b32 s58, s17, 16 -; GFX9-NEXT: s_lshr_b32 s59, s16, 16 +; GFX9-NEXT: s_add_u32 s14, s14, 3 +; GFX9-NEXT: s_addc_u32 s15, s15, 0 +; GFX9-NEXT: s_add_u32 s12, s12, 3 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_add_u32 s10, s10, 3 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_lshr_b32 s24, s23, 16 +; GFX9-NEXT: s_lshr_b32 s25, s22, 16 +; GFX9-NEXT: s_lshr_b32 s26, s21, 16 +; GFX9-NEXT: s_lshr_b32 s27, s20, 16 +; GFX9-NEXT: s_lshr_b32 s28, s19, 16 +; GFX9-NEXT: s_lshr_b32 s29, s18, 16 +; GFX9-NEXT: s_lshr_b32 s40, s17, 16 +; GFX9-NEXT: s_lshr_b32 s41, s16, 16 +; GFX9-NEXT: s_lshr_b32 s42, s15, 16 +; GFX9-NEXT: s_lshr_b32 s43, s14, 16 +; GFX9-NEXT: s_lshr_b32 s44, s13, 16 +; GFX9-NEXT: s_lshr_b32 s45, s12, 16 +; GFX9-NEXT: s_lshr_b32 s46, s11, 16 +; GFX9-NEXT: s_lshr_b32 s47, s10, 16 +; GFX9-NEXT: s_lshr_b32 s56, s9, 16 +; GFX9-NEXT: s_lshr_b32 s57, s8, 16 +; GFX9-NEXT: s_lshr_b32 s58, s7, 16 +; GFX9-NEXT: s_lshr_b32 s59, s6, 16 ; GFX9-NEXT: .LBB45_3: ; %end -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s59 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s58 -; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s57 -; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s56 -; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s47 -; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s46 -; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s45 -; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s44 -; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s43 -; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s42 -; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s41 -; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s40 -; GFX9-NEXT: s_pack_ll_b32_b16 s15, s28, s15 -; GFX9-NEXT: s_pack_ll_b32_b16 s14, s29, s14 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s13 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s12 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s11 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s7, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s9, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s10, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s11, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s12, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s13, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s14, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s15, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s16, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s17, s40 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s29 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s28 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s27 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s26 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s25 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s24 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s18 -; GFX9-NEXT: v_mov_b32_e32 v5, s19 -; GFX9-NEXT: v_mov_b32_e32 v6, s20 -; GFX9-NEXT: v_mov_b32_e32 v7, s21 -; GFX9-NEXT: v_mov_b32_e32 v8, s22 -; GFX9-NEXT: v_mov_b32_e32 v9, s23 -; GFX9-NEXT: v_mov_b32_e32 v10, s24 -; GFX9-NEXT: v_mov_b32_e32 v11, s25 -; GFX9-NEXT: v_mov_b32_e32 v12, s15 -; GFX9-NEXT: v_mov_b32_e32 v13, s14 -; GFX9-NEXT: v_mov_b32_e32 v14, s6 -; GFX9-NEXT: v_mov_b32_e32 v15, s7 -; GFX9-NEXT: v_mov_b32_e32 v16, s8 -; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-NEXT: v_mov_b32_e32 v12, s16 +; GFX9-NEXT: v_mov_b32_e32 v13, s17 +; GFX9-NEXT: v_mov_b32_e32 v14, s18 +; GFX9-NEXT: v_mov_b32_e32 v15, s19 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB45_4: ; GFX9-NEXT: ; implicit-def: $sgpr59 @@ -19508,12 +19869,12 @@ define inreg <36 x half> @bitcast_v9i64_to_v36f16_scalar(<9 x i64> inreg %a, i32 ; GFX9-NEXT: ; implicit-def: $sgpr42 ; GFX9-NEXT: ; implicit-def: $sgpr41 ; GFX9-NEXT: ; implicit-def: $sgpr40 -; GFX9-NEXT: ; implicit-def: $sgpr15 -; GFX9-NEXT: ; implicit-def: $sgpr14 -; GFX9-NEXT: ; implicit-def: $sgpr13 -; GFX9-NEXT: ; implicit-def: $sgpr12 -; GFX9-NEXT: ; implicit-def: $sgpr11 -; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr29 +; GFX9-NEXT: ; implicit-def: $sgpr28 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr25 +; GFX9-NEXT: ; implicit-def: $sgpr24 ; GFX9-NEXT: s_branch .LBB45_2 ; ; GFX11-LABEL: bitcast_v9i64_to_v36f16_scalar: @@ -24437,296 +24798,325 @@ define inreg <36 x half> @bitcast_v9f64_to_v36f16_scalar(<9 x double> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; SI-NEXT: v_readfirstlane_b32 s6, v1 -; SI-NEXT: v_readfirstlane_b32 s7, v2 -; SI-NEXT: v_readfirstlane_b32 s4, v3 -; SI-NEXT: s_and_b64 s[8:9], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s5, v4 +; SI-NEXT: v_mov_b32_e32 v18, s16 +; SI-NEXT: v_mov_b32_e32 v19, s17 +; SI-NEXT: v_mov_b32_e32 v16, s18 +; SI-NEXT: v_mov_b32_e32 v17, s19 +; SI-NEXT: v_mov_b32_e32 v14, s20 +; SI-NEXT: v_mov_b32_e32 v15, s21 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v13, s23 +; SI-NEXT: v_mov_b32_e32 v10, s24 +; SI-NEXT: v_mov_b32_e32 v11, s25 +; SI-NEXT: v_mov_b32_e32 v8, s26 +; SI-NEXT: v_mov_b32_e32 v9, s27 +; SI-NEXT: v_mov_b32_e32 v6, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v7, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s8, s5, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s8 -; SI-NEXT: s_lshr_b32 s8, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 -; SI-NEXT: s_lshr_b32 s8, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 -; SI-NEXT: s_lshr_b32 s8, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s8 -; SI-NEXT: s_lshr_b32 s8, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 -; SI-NEXT: s_lshr_b32 s8, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 -; SI-NEXT: s_lshr_b32 s8, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s8 -; SI-NEXT: s_lshr_b32 s8, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s8 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s8 -; SI-NEXT: s_lshr_b32 s8, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s8 -; SI-NEXT: s_lshr_b32 s8, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s8 -; SI-NEXT: s_lshr_b32 s8, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s8 -; SI-NEXT: s_lshr_b32 s8, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s8 -; SI-NEXT: s_lshr_b32 s8, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s8 -; SI-NEXT: s_lshr_b32 s8, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s8 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s8 -; SI-NEXT: s_lshr_b32 s8, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s8 -; SI-NEXT: s_lshr_b32 s8, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v40, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v42, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v3 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v44, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v46, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v18 ; SI-NEXT: s_cbranch_execnz .LBB53_3 ; SI-NEXT: .LBB53_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[1:2], s[16:17], 1.0 -; SI-NEXT: v_add_f64 v[28:29], s[18:19], 1.0 -; SI-NEXT: v_add_f64 v[25:26], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[21:22], s[22:23], 1.0 -; SI-NEXT: v_add_f64 v[17:18], s[24:25], 1.0 -; SI-NEXT: v_add_f64 v[13:14], s[26:27], 1.0 -; SI-NEXT: v_add_f64 v[10:11], s[28:29], 1.0 -; SI-NEXT: v_add_f64 v[6:7], s[6:7], 1.0 -; SI-NEXT: v_add_f64 v[4:5], s[4:5], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v17 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 ; SI-NEXT: .LBB53_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_or_b32_e32 v35, v35, v36 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v43 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v35, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v33, v33, v34 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v33, vcc, 8, v0 -; SI-NEXT: v_or_b32_e32 v31, v31, v32 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v31, vcc, 12, v0 -; SI-NEXT: v_or_b32_e32 v29, v29, v30 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v29, vcc, 16, v0 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v27, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v25, vcc, 24, v0 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v23, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v21, vcc, 32, v0 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v19, vcc, 36, v0 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v17, vcc, 40, v0 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v15, vcc, 44, v0 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 48, v0 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_branch .LBB53_2 ; ; VI-LABEL: bitcast_v9f64_to_v36f16_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll index 8964ebd9cbd70..78c1971c50d14 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll @@ -2837,177 +2837,205 @@ define inreg <40 x i16> @bitcast_v20i32_to_v40i16_scalar(<20 x i32> inreg %a, i3 ; SI-LABEL: bitcast_v20i32_to_v40i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, s16 +; SI-NEXT: v_mov_b32_e32 v9, s17 +; SI-NEXT: v_mov_b32_e32 v10, s18 +; SI-NEXT: v_mov_b32_e32 v11, s19 +; SI-NEXT: v_mov_b32_e32 v12, s20 +; SI-NEXT: v_mov_b32_e32 v13, s21 +; SI-NEXT: v_mov_b32_e32 v14, s22 +; SI-NEXT: v_mov_b32_e32 v15, s23 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v17, s25 +; SI-NEXT: v_mov_b32_e32 v18, s26 +; SI-NEXT: v_mov_b32_e32 v19, s27 +; SI-NEXT: v_readfirstlane_b32 s22, v8 +; SI-NEXT: v_mov_b32_e32 v8, s28 +; SI-NEXT: v_readfirstlane_b32 s23, v9 +; SI-NEXT: v_mov_b32_e32 v9, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: v_readfirstlane_b32 s20, v10 +; SI-NEXT: v_readfirstlane_b32 s21, v11 +; SI-NEXT: v_readfirstlane_b32 s18, v12 +; SI-NEXT: v_readfirstlane_b32 s19, v13 +; SI-NEXT: v_readfirstlane_b32 s16, v14 +; SI-NEXT: v_readfirstlane_b32 s17, v15 +; SI-NEXT: v_readfirstlane_b32 s14, v16 +; SI-NEXT: v_readfirstlane_b32 s15, v17 +; SI-NEXT: v_readfirstlane_b32 s12, v18 +; SI-NEXT: v_readfirstlane_b32 s13, v19 +; SI-NEXT: v_readfirstlane_b32 s10, v8 +; SI-NEXT: v_readfirstlane_b32 s11, v9 ; SI-NEXT: v_readfirstlane_b32 s8, v1 ; SI-NEXT: v_readfirstlane_b32 s9, v2 ; SI-NEXT: v_readfirstlane_b32 s6, v3 ; SI-NEXT: v_readfirstlane_b32 s7, v4 ; SI-NEXT: v_readfirstlane_b32 s4, v5 -; SI-NEXT: s_and_b64 s[10:11], vcc, exec +; SI-NEXT: s_and_b64 s[24:25], vcc, exec ; SI-NEXT: v_readfirstlane_b32 s5, v6 ; SI-NEXT: s_cbranch_scc0 .LBB13_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s72, s5, 16 ; SI-NEXT: s_lshr_b32 s73, s7, 16 ; SI-NEXT: s_lshr_b32 s74, s9, 16 -; SI-NEXT: s_lshr_b32 s75, s29, 16 -; SI-NEXT: s_lshr_b32 s76, s27, 16 -; SI-NEXT: s_lshr_b32 s77, s25, 16 -; SI-NEXT: s_lshr_b32 s78, s23, 16 -; SI-NEXT: s_lshr_b32 s79, s21, 16 -; SI-NEXT: s_lshr_b32 s88, s19, 16 -; SI-NEXT: s_lshr_b32 s89, s17, 16 -; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[12:13], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[14:15], s[8:9], 16 -; SI-NEXT: s_lshr_b64 s[40:41], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[42:43], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[46:47], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[56:57], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[58:59], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[60:61], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s75, s11, 16 +; SI-NEXT: s_lshr_b32 s76, s13, 16 +; SI-NEXT: s_lshr_b32 s77, s15, 16 +; SI-NEXT: s_lshr_b32 s78, s17, 16 +; SI-NEXT: s_lshr_b32 s79, s19, 16 +; SI-NEXT: s_lshr_b32 s88, s21, 16 +; SI-NEXT: s_lshr_b32 s89, s23, 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[22:23], 16 ; SI-NEXT: s_cbranch_execnz .LBB13_3 ; SI-NEXT: .LBB13_2: ; %cmp.true -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 ; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s5, s5, 3 ; SI-NEXT: s_add_i32 s4, s4, 3 -; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[12:13], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[14:15], s[8:9], 16 -; SI-NEXT: s_lshr_b64 s[40:41], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[42:43], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[46:47], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[16:17], 16 ; SI-NEXT: s_lshr_b32 s72, s5, 16 ; SI-NEXT: s_lshr_b32 s73, s7, 16 ; SI-NEXT: s_lshr_b32 s74, s9, 16 -; SI-NEXT: s_lshr_b32 s75, s29, 16 -; SI-NEXT: s_lshr_b32 s76, s27, 16 -; SI-NEXT: s_lshr_b32 s77, s25, 16 -; SI-NEXT: s_lshr_b32 s78, s23, 16 -; SI-NEXT: s_lshr_b32 s79, s21, 16 -; SI-NEXT: s_lshr_b32 s88, s19, 16 -; SI-NEXT: s_lshr_b32 s89, s17, 16 -; SI-NEXT: s_lshr_b64 s[56:57], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[58:59], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[60:61], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s75, s11, 16 +; SI-NEXT: s_lshr_b32 s76, s13, 16 +; SI-NEXT: s_lshr_b32 s77, s15, 16 +; SI-NEXT: s_lshr_b32 s78, s17, 16 +; SI-NEXT: s_lshr_b32 s79, s19, 16 +; SI-NEXT: s_lshr_b32 s88, s21, 16 +; SI-NEXT: s_lshr_b32 s89, s23, 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[22:23], 16 ; SI-NEXT: .LBB13_3: ; %end -; SI-NEXT: s_lshl_b32 s11, s60, 16 -; SI-NEXT: s_and_b32 s13, s16, 0xffff -; SI-NEXT: s_or_b32 s11, s13, s11 -; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: s_and_b32 s11, s17, 0xffff -; SI-NEXT: s_lshl_b32 s13, s89, 16 -; SI-NEXT: s_or_b32 s11, s11, s13 -; SI-NEXT: v_mov_b32_e32 v2, s11 -; SI-NEXT: s_lshl_b32 s11, s58, 16 -; SI-NEXT: s_and_b32 s13, s18, 0xffff -; SI-NEXT: s_or_b32 s11, s13, s11 +; SI-NEXT: s_lshl_b32 s25, s60, 16 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_or_b32 s22, s22, s25 +; SI-NEXT: v_mov_b32_e32 v1, s22 +; SI-NEXT: s_and_b32 s22, s23, 0xffff +; SI-NEXT: s_lshl_b32 s23, s89, 16 +; SI-NEXT: s_or_b32 s22, s22, s23 +; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: s_lshl_b32 s22, s58, 16 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_or_b32 s20, s20, s22 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s11 -; SI-NEXT: s_and_b32 s11, s19, 0xffff -; SI-NEXT: s_lshl_b32 s13, s88, 16 +; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_and_b32 s20, s21, 0xffff +; SI-NEXT: s_lshl_b32 s21, s88, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: s_or_b32 s20, s20, s21 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s11 -; SI-NEXT: s_and_b32 s11, s20, 0xffff -; SI-NEXT: s_lshl_b32 s13, s56, 16 +; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s20, s56, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: s_or_b32 s18, s18, s20 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s11 -; SI-NEXT: s_and_b32 s11, s21, 0xffff -; SI-NEXT: s_lshl_b32 s13, s79, 16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_and_b32 s18, s19, 0xffff +; SI-NEXT: s_lshl_b32 s19, s79, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s11 -; SI-NEXT: s_and_b32 s11, s22, 0xffff -; SI-NEXT: s_lshl_b32 s13, s46, 16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s18, s46, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: s_or_b32 s16, s16, s18 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s11 -; SI-NEXT: s_and_b32 s11, s23, 0xffff -; SI-NEXT: s_lshl_b32 s13, s78, 16 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s17, 0xffff +; SI-NEXT: s_lshl_b32 s17, s78, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s11 -; SI-NEXT: s_and_b32 s11, s24, 0xffff -; SI-NEXT: s_lshl_b32 s13, s44, 16 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s16, s44, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: s_or_b32 s14, s14, s16 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s11 -; SI-NEXT: s_and_b32 s11, s25, 0xffff -; SI-NEXT: s_lshl_b32 s13, s77, 16 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_and_b32 s14, s15, 0xffff +; SI-NEXT: s_lshl_b32 s15, s77, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s11 -; SI-NEXT: s_and_b32 s11, s26, 0xffff -; SI-NEXT: s_lshl_b32 s13, s42, 16 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s14, s42, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: s_or_b32 s12, s12, s14 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s11 -; SI-NEXT: s_and_b32 s11, s27, 0xffff +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s12, s13, 0xffff ; SI-NEXT: s_lshl_b32 s13, s76, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 -; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: s_or_b32 s12, s12, s13 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s11 -; SI-NEXT: s_and_b32 s11, s28, 0xffff -; SI-NEXT: s_lshl_b32 s13, s40, 16 +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s12, s40, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: s_or_b32 s10, s10, s12 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s11 -; SI-NEXT: s_and_b32 s11, s29, 0xffff -; SI-NEXT: s_lshl_b32 s13, s75, 16 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s10, s11, 0xffff +; SI-NEXT: s_lshl_b32 s11, s75, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: s_lshl_b32 s11, s14, 16 +; SI-NEXT: s_lshl_b32 s10, s28, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 -; SI-NEXT: s_or_b32 s8, s8, s11 +; SI-NEXT: s_or_b32 s8, s8, s10 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 @@ -3019,7 +3047,7 @@ define inreg <40 x i16> @bitcast_v20i32_to_v40i16_scalar(<20 x i32> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_lshl_b32 s8, s12, 16 +; SI-NEXT: s_lshl_b32 s8, s26, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 ; SI-NEXT: s_or_b32 s6, s6, s8 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -3033,7 +3061,7 @@ define inreg <40 x i16> @bitcast_v20i32_to_v40i16_scalar(<20 x i32> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s6, s10, 16 +; SI-NEXT: s_lshl_b32 s6, s24, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -3064,18 +3092,46 @@ define inreg <40 x i16> @bitcast_v20i32_to_v40i16_scalar(<20 x i32> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr76 ; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: ; implicit-def: $sgpr75 -; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr73 -; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr24 ; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: s_branch .LBB13_2 ; ; VI-LABEL: bitcast_v20i32_to_v40i16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, s16 +; VI-NEXT: v_mov_b32_e32 v8, s17 +; VI-NEXT: v_mov_b32_e32 v9, s18 +; VI-NEXT: v_mov_b32_e32 v10, s19 +; VI-NEXT: v_mov_b32_e32 v11, s20 +; VI-NEXT: v_mov_b32_e32 v12, s21 +; VI-NEXT: v_mov_b32_e32 v13, s22 +; VI-NEXT: v_mov_b32_e32 v14, s23 +; VI-NEXT: v_mov_b32_e32 v15, s24 +; VI-NEXT: v_mov_b32_e32 v16, s25 +; VI-NEXT: v_mov_b32_e32 v17, s26 +; VI-NEXT: v_mov_b32_e32 v18, s27 +; VI-NEXT: v_mov_b32_e32 v19, s28 +; VI-NEXT: v_readfirstlane_b32 s25, v7 +; VI-NEXT: v_mov_b32_e32 v7, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_readfirstlane_b32 s24, v8 +; VI-NEXT: v_readfirstlane_b32 s23, v9 +; VI-NEXT: v_readfirstlane_b32 s22, v10 +; VI-NEXT: v_readfirstlane_b32 s21, v11 +; VI-NEXT: v_readfirstlane_b32 s20, v12 +; VI-NEXT: v_readfirstlane_b32 s19, v13 +; VI-NEXT: v_readfirstlane_b32 s18, v14 +; VI-NEXT: v_readfirstlane_b32 s17, v15 +; VI-NEXT: v_readfirstlane_b32 s16, v16 +; VI-NEXT: v_readfirstlane_b32 s15, v17 +; VI-NEXT: v_readfirstlane_b32 s14, v18 +; VI-NEXT: v_readfirstlane_b32 s13, v19 +; VI-NEXT: v_readfirstlane_b32 s12, v7 ; VI-NEXT: v_readfirstlane_b32 s11, v0 ; VI-NEXT: v_readfirstlane_b32 s10, v1 ; VI-NEXT: v_readfirstlane_b32 s9, v2 @@ -3085,26 +3141,26 @@ define inreg <40 x i16> @bitcast_v20i32_to_v40i16_scalar(<20 x i32> inreg %a, i3 ; VI-NEXT: v_readfirstlane_b32 s7, v5 ; VI-NEXT: s_cbranch_scc0 .LBB13_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s12, s7, 16 -; VI-NEXT: s_lshr_b32 s13, s6, 16 -; VI-NEXT: s_lshr_b32 s14, s8, 16 -; VI-NEXT: s_lshr_b32 s15, s9, 16 +; VI-NEXT: s_lshr_b32 s26, s7, 16 +; VI-NEXT: s_lshr_b32 s27, s6, 16 +; VI-NEXT: s_lshr_b32 s28, s8, 16 +; VI-NEXT: s_lshr_b32 s29, s9, 16 ; VI-NEXT: s_lshr_b32 s40, s10, 16 ; VI-NEXT: s_lshr_b32 s41, s11, 16 -; VI-NEXT: s_lshr_b32 s42, s29, 16 -; VI-NEXT: s_lshr_b32 s43, s28, 16 -; VI-NEXT: s_lshr_b32 s44, s27, 16 -; VI-NEXT: s_lshr_b32 s45, s26, 16 -; VI-NEXT: s_lshr_b32 s46, s25, 16 -; VI-NEXT: s_lshr_b32 s47, s24, 16 -; VI-NEXT: s_lshr_b32 s56, s23, 16 -; VI-NEXT: s_lshr_b32 s57, s22, 16 -; VI-NEXT: s_lshr_b32 s58, s21, 16 -; VI-NEXT: s_lshr_b32 s59, s20, 16 -; VI-NEXT: s_lshr_b32 s60, s19, 16 -; VI-NEXT: s_lshr_b32 s61, s18, 16 -; VI-NEXT: s_lshr_b32 s62, s17, 16 -; VI-NEXT: s_lshr_b32 s63, s16, 16 +; VI-NEXT: s_lshr_b32 s42, s12, 16 +; VI-NEXT: s_lshr_b32 s43, s13, 16 +; VI-NEXT: s_lshr_b32 s44, s14, 16 +; VI-NEXT: s_lshr_b32 s45, s15, 16 +; VI-NEXT: s_lshr_b32 s46, s16, 16 +; VI-NEXT: s_lshr_b32 s47, s17, 16 +; VI-NEXT: s_lshr_b32 s56, s18, 16 +; VI-NEXT: s_lshr_b32 s57, s19, 16 +; VI-NEXT: s_lshr_b32 s58, s20, 16 +; VI-NEXT: s_lshr_b32 s59, s21, 16 +; VI-NEXT: s_lshr_b32 s60, s22, 16 +; VI-NEXT: s_lshr_b32 s61, s23, 16 +; VI-NEXT: s_lshr_b32 s62, s24, 16 +; VI-NEXT: s_lshr_b32 s63, s25, 16 ; VI-NEXT: s_cbranch_execnz .LBB13_3 ; VI-NEXT: .LBB13_2: ; %cmp.true ; VI-NEXT: s_add_i32 s7, s7, 3 @@ -3113,115 +3169,115 @@ define inreg <40 x i16> @bitcast_v20i32_to_v40i16_scalar(<20 x i32> inreg %a, i3 ; VI-NEXT: s_add_i32 s9, s9, 3 ; VI-NEXT: s_add_i32 s10, s10, 3 ; VI-NEXT: s_add_i32 s11, s11, 3 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_lshr_b32 s12, s7, 16 -; VI-NEXT: s_lshr_b32 s13, s6, 16 -; VI-NEXT: s_lshr_b32 s14, s8, 16 -; VI-NEXT: s_lshr_b32 s15, s9, 16 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_lshr_b32 s26, s7, 16 +; VI-NEXT: s_lshr_b32 s27, s6, 16 +; VI-NEXT: s_lshr_b32 s28, s8, 16 +; VI-NEXT: s_lshr_b32 s29, s9, 16 ; VI-NEXT: s_lshr_b32 s40, s10, 16 ; VI-NEXT: s_lshr_b32 s41, s11, 16 -; VI-NEXT: s_lshr_b32 s42, s29, 16 -; VI-NEXT: s_lshr_b32 s43, s28, 16 -; VI-NEXT: s_lshr_b32 s44, s27, 16 -; VI-NEXT: s_lshr_b32 s45, s26, 16 -; VI-NEXT: s_lshr_b32 s46, s25, 16 -; VI-NEXT: s_lshr_b32 s47, s24, 16 -; VI-NEXT: s_lshr_b32 s56, s23, 16 -; VI-NEXT: s_lshr_b32 s57, s22, 16 -; VI-NEXT: s_lshr_b32 s58, s21, 16 -; VI-NEXT: s_lshr_b32 s59, s20, 16 -; VI-NEXT: s_lshr_b32 s60, s19, 16 -; VI-NEXT: s_lshr_b32 s61, s18, 16 -; VI-NEXT: s_lshr_b32 s62, s17, 16 -; VI-NEXT: s_lshr_b32 s63, s16, 16 +; VI-NEXT: s_lshr_b32 s42, s12, 16 +; VI-NEXT: s_lshr_b32 s43, s13, 16 +; VI-NEXT: s_lshr_b32 s44, s14, 16 +; VI-NEXT: s_lshr_b32 s45, s15, 16 +; VI-NEXT: s_lshr_b32 s46, s16, 16 +; VI-NEXT: s_lshr_b32 s47, s17, 16 +; VI-NEXT: s_lshr_b32 s56, s18, 16 +; VI-NEXT: s_lshr_b32 s57, s19, 16 +; VI-NEXT: s_lshr_b32 s58, s20, 16 +; VI-NEXT: s_lshr_b32 s59, s21, 16 +; VI-NEXT: s_lshr_b32 s60, s22, 16 +; VI-NEXT: s_lshr_b32 s61, s23, 16 +; VI-NEXT: s_lshr_b32 s62, s24, 16 +; VI-NEXT: s_lshr_b32 s63, s25, 16 ; VI-NEXT: .LBB13_3: ; %end -; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_and_b32 s4, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s5, s63, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s16, s62, 16 -; VI-NEXT: s_or_b32 s5, s5, s16 -; VI-NEXT: s_and_b32 s16, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s17, s61, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_and_b32 s17, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s18, s60, 16 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s18, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s19, s59, 16 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_and_b32 s19, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s20, s58, 16 -; VI-NEXT: s_or_b32 s19, s19, s20 -; VI-NEXT: s_and_b32 s20, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s21, s57, 16 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_and_b32 s21, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s22, s56, 16 -; VI-NEXT: s_or_b32 s21, s21, s22 -; VI-NEXT: s_and_b32 s22, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s23, s47, 16 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_and_b32 s23, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s24, s46, 16 +; VI-NEXT: s_and_b32 s5, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s24, s62, 16 +; VI-NEXT: s_or_b32 s5, s5, s24 +; VI-NEXT: s_and_b32 s23, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s24, s61, 16 ; VI-NEXT: s_or_b32 s23, s23, s24 -; VI-NEXT: s_and_b32 s24, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s25, s45, 16 -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_and_b32 s25, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s26, s44, 16 -; VI-NEXT: s_or_b32 s25, s25, s26 -; VI-NEXT: s_and_b32 s26, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s27, s43, 16 -; VI-NEXT: s_or_b32 s26, s26, s27 -; VI-NEXT: s_and_b32 s27, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s28, s42, 16 -; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s22, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s24, s60, 16 +; VI-NEXT: s_or_b32 s22, s22, s24 +; VI-NEXT: s_and_b32 s21, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s24, s59, 16 +; VI-NEXT: s_or_b32 s21, s21, s24 +; VI-NEXT: s_and_b32 s20, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s24, s58, 16 +; VI-NEXT: s_or_b32 s20, s20, s24 +; VI-NEXT: s_and_b32 s19, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s24, s57, 16 +; VI-NEXT: s_or_b32 s19, s19, s24 +; VI-NEXT: s_and_b32 s18, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s24, s56, 16 +; VI-NEXT: s_or_b32 s18, s18, s24 +; VI-NEXT: s_and_b32 s17, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s24, s47, 16 +; VI-NEXT: s_or_b32 s17, s17, s24 +; VI-NEXT: s_and_b32 s16, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s24, s46, 16 +; VI-NEXT: s_or_b32 s16, s16, s24 +; VI-NEXT: s_and_b32 s15, 0xffff, s15 +; VI-NEXT: s_lshl_b32 s24, s45, 16 +; VI-NEXT: s_or_b32 s15, s15, s24 +; VI-NEXT: s_and_b32 s14, 0xffff, s14 +; VI-NEXT: s_lshl_b32 s24, s44, 16 +; VI-NEXT: s_or_b32 s14, s14, s24 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s24, s43, 16 +; VI-NEXT: s_or_b32 s13, s13, s24 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s24, s42, 16 +; VI-NEXT: s_or_b32 s12, s12, s24 ; VI-NEXT: s_and_b32 s11, 0xffff, s11 -; VI-NEXT: s_lshl_b32 s28, s41, 16 -; VI-NEXT: s_or_b32 s11, s11, s28 +; VI-NEXT: s_lshl_b32 s24, s41, 16 +; VI-NEXT: s_or_b32 s11, s11, s24 ; VI-NEXT: s_and_b32 s10, 0xffff, s10 -; VI-NEXT: s_lshl_b32 s28, s40, 16 +; VI-NEXT: s_lshl_b32 s24, s40, 16 +; VI-NEXT: s_or_b32 s10, s10, s24 ; VI-NEXT: s_and_b32 s9, 0xffff, s9 -; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_lshl_b32 s24, s29, 16 +; VI-NEXT: s_or_b32 s9, s9, s24 ; VI-NEXT: s_and_b32 s8, 0xffff, s8 -; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_lshl_b32 s24, s28, 16 +; VI-NEXT: s_or_b32 s8, s8, s24 ; VI-NEXT: s_and_b32 s6, 0xffff, s6 -; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_lshl_b32 s24, s27, 16 +; VI-NEXT: s_or_b32 s6, s6, s24 ; VI-NEXT: s_and_b32 s7, 0xffff, s7 -; VI-NEXT: s_lshl_b32 s12, s12, 16 -; VI-NEXT: s_or_b32 s10, s10, s28 -; VI-NEXT: s_or_b32 s9, s9, s15 -; VI-NEXT: s_or_b32 s8, s8, s14 -; VI-NEXT: s_or_b32 s6, s6, s13 -; VI-NEXT: s_or_b32 s7, s7, s12 +; VI-NEXT: s_lshl_b32 s24, s26, 16 +; VI-NEXT: s_or_b32 s7, s7, s24 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: v_mov_b32_e32 v3, s17 -; VI-NEXT: v_mov_b32_e32 v4, s18 -; VI-NEXT: v_mov_b32_e32 v5, s19 -; VI-NEXT: v_mov_b32_e32 v6, s20 -; VI-NEXT: v_mov_b32_e32 v7, s21 -; VI-NEXT: v_mov_b32_e32 v8, s22 -; VI-NEXT: v_mov_b32_e32 v9, s23 -; VI-NEXT: v_mov_b32_e32 v10, s24 -; VI-NEXT: v_mov_b32_e32 v11, s25 -; VI-NEXT: v_mov_b32_e32 v12, s26 -; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v2, s23 +; VI-NEXT: v_mov_b32_e32 v3, s22 +; VI-NEXT: v_mov_b32_e32 v4, s21 +; VI-NEXT: v_mov_b32_e32 v5, s20 +; VI-NEXT: v_mov_b32_e32 v6, s19 +; VI-NEXT: v_mov_b32_e32 v7, s18 +; VI-NEXT: v_mov_b32_e32 v8, s17 +; VI-NEXT: v_mov_b32_e32 v9, s16 +; VI-NEXT: v_mov_b32_e32 v10, s15 +; VI-NEXT: v_mov_b32_e32 v11, s14 +; VI-NEXT: v_mov_b32_e32 v12, s13 +; VI-NEXT: v_mov_b32_e32 v13, s12 ; VI-NEXT: v_mov_b32_e32 v14, s11 ; VI-NEXT: v_mov_b32_e32 v15, s10 ; VI-NEXT: v_mov_b32_e32 v16, s9 @@ -3246,57 +3302,75 @@ define inreg <40 x i16> @bitcast_v20i32_to_v40i16_scalar(<20 x i32> inreg %a, i3 ; VI-NEXT: ; implicit-def: $sgpr42 ; VI-NEXT: ; implicit-def: $sgpr41 ; VI-NEXT: ; implicit-def: $sgpr40 -; VI-NEXT: ; implicit-def: $sgpr15 -; VI-NEXT: ; implicit-def: $sgpr14 -; VI-NEXT: ; implicit-def: $sgpr13 -; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: s_branch .LBB13_2 ; ; GFX9-LABEL: bitcast_v20i32_to_v40i16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, s16 +; GFX9-NEXT: v_mov_b32_e32 v8, s17 +; GFX9-NEXT: v_mov_b32_e32 v9, s18 +; GFX9-NEXT: v_mov_b32_e32 v10, s19 +; GFX9-NEXT: v_mov_b32_e32 v11, s20 +; GFX9-NEXT: v_mov_b32_e32 v12, s21 +; GFX9-NEXT: v_mov_b32_e32 v13, s22 +; GFX9-NEXT: v_mov_b32_e32 v14, s23 +; GFX9-NEXT: v_mov_b32_e32 v15, s24 +; GFX9-NEXT: v_mov_b32_e32 v16, s25 +; GFX9-NEXT: v_mov_b32_e32 v17, s26 +; GFX9-NEXT: v_mov_b32_e32 v18, s27 +; GFX9-NEXT: v_mov_b32_e32 v19, s28 +; GFX9-NEXT: v_readfirstlane_b32 s6, v7 +; GFX9-NEXT: v_mov_b32_e32 v7, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: v_readfirstlane_b32 s7, v1 -; GFX9-NEXT: v_readfirstlane_b32 s8, v2 -; GFX9-NEXT: v_readfirstlane_b32 s9, v3 -; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: v_readfirstlane_b32 s7, v8 +; GFX9-NEXT: v_readfirstlane_b32 s8, v9 +; GFX9-NEXT: v_readfirstlane_b32 s9, v10 +; GFX9-NEXT: v_readfirstlane_b32 s10, v11 +; GFX9-NEXT: v_readfirstlane_b32 s11, v12 +; GFX9-NEXT: v_readfirstlane_b32 s12, v13 +; GFX9-NEXT: v_readfirstlane_b32 s13, v14 +; GFX9-NEXT: v_readfirstlane_b32 s14, v15 +; GFX9-NEXT: v_readfirstlane_b32 s15, v16 +; GFX9-NEXT: v_readfirstlane_b32 s16, v17 +; GFX9-NEXT: v_readfirstlane_b32 s17, v18 +; GFX9-NEXT: v_readfirstlane_b32 s18, v19 +; GFX9-NEXT: v_readfirstlane_b32 s19, v7 +; GFX9-NEXT: v_readfirstlane_b32 s20, v0 +; GFX9-NEXT: v_readfirstlane_b32 s21, v1 +; GFX9-NEXT: v_readfirstlane_b32 s22, v2 +; GFX9-NEXT: v_readfirstlane_b32 s23, v3 +; GFX9-NEXT: v_readfirstlane_b32 s24, v4 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: v_readfirstlane_b32 s25, v5 ; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_lshr_b32 s12, s11, 16 -; GFX9-NEXT: s_lshr_b32 s13, s10, 16 -; GFX9-NEXT: s_lshr_b32 s14, s9, 16 -; GFX9-NEXT: s_lshr_b32 s15, s8, 16 -; GFX9-NEXT: s_lshr_b32 s40, s7, 16 -; GFX9-NEXT: s_lshr_b32 s41, s6, 16 -; GFX9-NEXT: s_lshr_b32 s42, s29, 16 -; GFX9-NEXT: s_lshr_b32 s43, s28, 16 -; GFX9-NEXT: s_lshr_b32 s44, s27, 16 -; GFX9-NEXT: s_lshr_b32 s45, s26, 16 -; GFX9-NEXT: s_lshr_b32 s46, s25, 16 -; GFX9-NEXT: s_lshr_b32 s47, s24, 16 -; GFX9-NEXT: s_lshr_b32 s56, s23, 16 -; GFX9-NEXT: s_lshr_b32 s57, s22, 16 -; GFX9-NEXT: s_lshr_b32 s58, s21, 16 -; GFX9-NEXT: s_lshr_b32 s59, s20, 16 -; GFX9-NEXT: s_lshr_b32 s60, s19, 16 -; GFX9-NEXT: s_lshr_b32 s61, s18, 16 -; GFX9-NEXT: s_lshr_b32 s62, s17, 16 -; GFX9-NEXT: s_lshr_b32 s63, s16, 16 +; GFX9-NEXT: s_lshr_b32 s26, s25, 16 +; GFX9-NEXT: s_lshr_b32 s27, s24, 16 +; GFX9-NEXT: s_lshr_b32 s28, s23, 16 +; GFX9-NEXT: s_lshr_b32 s29, s22, 16 +; GFX9-NEXT: s_lshr_b32 s40, s21, 16 +; GFX9-NEXT: s_lshr_b32 s41, s20, 16 +; GFX9-NEXT: s_lshr_b32 s42, s19, 16 +; GFX9-NEXT: s_lshr_b32 s43, s18, 16 +; GFX9-NEXT: s_lshr_b32 s44, s17, 16 +; GFX9-NEXT: s_lshr_b32 s45, s16, 16 +; GFX9-NEXT: s_lshr_b32 s46, s15, 16 +; GFX9-NEXT: s_lshr_b32 s47, s14, 16 +; GFX9-NEXT: s_lshr_b32 s56, s13, 16 +; GFX9-NEXT: s_lshr_b32 s57, s12, 16 +; GFX9-NEXT: s_lshr_b32 s58, s11, 16 +; GFX9-NEXT: s_lshr_b32 s59, s10, 16 +; GFX9-NEXT: s_lshr_b32 s60, s9, 16 +; GFX9-NEXT: s_lshr_b32 s61, s8, 16 +; GFX9-NEXT: s_lshr_b32 s62, s7, 16 +; GFX9-NEXT: s_lshr_b32 s63, s6, 16 ; GFX9-NEXT: s_cbranch_execnz .LBB13_3 ; GFX9-NEXT: .LBB13_2: ; %cmp.true -; GFX9-NEXT: s_add_i32 s11, s11, 3 -; GFX9-NEXT: s_add_i32 s10, s10, 3 -; GFX9-NEXT: s_add_i32 s9, s9, 3 -; GFX9-NEXT: s_add_i32 s8, s8, 3 -; GFX9-NEXT: s_add_i32 s7, s7, 3 -; GFX9-NEXT: s_add_i32 s6, s6, 3 -; GFX9-NEXT: s_add_i32 s29, s29, 3 -; GFX9-NEXT: s_add_i32 s28, s28, 3 -; GFX9-NEXT: s_add_i32 s27, s27, 3 -; GFX9-NEXT: s_add_i32 s26, s26, 3 ; GFX9-NEXT: s_add_i32 s25, s25, 3 ; GFX9-NEXT: s_add_i32 s24, s24, 3 ; GFX9-NEXT: s_add_i32 s23, s23, 3 @@ -3307,67 +3381,77 @@ define inreg <40 x i16> @bitcast_v20i32_to_v40i16_scalar(<20 x i32> inreg %a, i3 ; GFX9-NEXT: s_add_i32 s18, s18, 3 ; GFX9-NEXT: s_add_i32 s17, s17, 3 ; GFX9-NEXT: s_add_i32 s16, s16, 3 -; GFX9-NEXT: s_lshr_b32 s12, s11, 16 -; GFX9-NEXT: s_lshr_b32 s13, s10, 16 -; GFX9-NEXT: s_lshr_b32 s14, s9, 16 -; GFX9-NEXT: s_lshr_b32 s15, s8, 16 -; GFX9-NEXT: s_lshr_b32 s40, s7, 16 -; GFX9-NEXT: s_lshr_b32 s41, s6, 16 -; GFX9-NEXT: s_lshr_b32 s42, s29, 16 -; GFX9-NEXT: s_lshr_b32 s43, s28, 16 -; GFX9-NEXT: s_lshr_b32 s44, s27, 16 -; GFX9-NEXT: s_lshr_b32 s45, s26, 16 -; GFX9-NEXT: s_lshr_b32 s46, s25, 16 -; GFX9-NEXT: s_lshr_b32 s47, s24, 16 -; GFX9-NEXT: s_lshr_b32 s56, s23, 16 -; GFX9-NEXT: s_lshr_b32 s57, s22, 16 -; GFX9-NEXT: s_lshr_b32 s58, s21, 16 -; GFX9-NEXT: s_lshr_b32 s59, s20, 16 -; GFX9-NEXT: s_lshr_b32 s60, s19, 16 -; GFX9-NEXT: s_lshr_b32 s61, s18, 16 -; GFX9-NEXT: s_lshr_b32 s62, s17, 16 -; GFX9-NEXT: s_lshr_b32 s63, s16, 16 +; GFX9-NEXT: s_add_i32 s15, s15, 3 +; GFX9-NEXT: s_add_i32 s14, s14, 3 +; GFX9-NEXT: s_add_i32 s13, s13, 3 +; GFX9-NEXT: s_add_i32 s12, s12, 3 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_add_i32 s10, s10, 3 +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: s_add_i32 s6, s6, 3 +; GFX9-NEXT: s_lshr_b32 s26, s25, 16 +; GFX9-NEXT: s_lshr_b32 s27, s24, 16 +; GFX9-NEXT: s_lshr_b32 s28, s23, 16 +; GFX9-NEXT: s_lshr_b32 s29, s22, 16 +; GFX9-NEXT: s_lshr_b32 s40, s21, 16 +; GFX9-NEXT: s_lshr_b32 s41, s20, 16 +; GFX9-NEXT: s_lshr_b32 s42, s19, 16 +; GFX9-NEXT: s_lshr_b32 s43, s18, 16 +; GFX9-NEXT: s_lshr_b32 s44, s17, 16 +; GFX9-NEXT: s_lshr_b32 s45, s16, 16 +; GFX9-NEXT: s_lshr_b32 s46, s15, 16 +; GFX9-NEXT: s_lshr_b32 s47, s14, 16 +; GFX9-NEXT: s_lshr_b32 s56, s13, 16 +; GFX9-NEXT: s_lshr_b32 s57, s12, 16 +; GFX9-NEXT: s_lshr_b32 s58, s11, 16 +; GFX9-NEXT: s_lshr_b32 s59, s10, 16 +; GFX9-NEXT: s_lshr_b32 s60, s9, 16 +; GFX9-NEXT: s_lshr_b32 s61, s8, 16 +; GFX9-NEXT: s_lshr_b32 s62, s7, 16 +; GFX9-NEXT: s_lshr_b32 s63, s6, 16 ; GFX9-NEXT: .LBB13_3: ; %end -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s63 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s62 -; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s61 -; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s60 -; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s59 -; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s58 -; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s57 -; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s56 -; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s47 -; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s46 -; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s45 -; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s44 -; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s43 -; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s42 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s41 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s40 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s15 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s14 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s13 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s7, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s9, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s10, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s11, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s12, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s13, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s14, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s15, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s16, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s17, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s40 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s29 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s28 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s27 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s26 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s18 -; GFX9-NEXT: v_mov_b32_e32 v5, s19 -; GFX9-NEXT: v_mov_b32_e32 v6, s20 -; GFX9-NEXT: v_mov_b32_e32 v7, s21 -; GFX9-NEXT: v_mov_b32_e32 v8, s22 -; GFX9-NEXT: v_mov_b32_e32 v9, s23 -; GFX9-NEXT: v_mov_b32_e32 v10, s24 -; GFX9-NEXT: v_mov_b32_e32 v11, s25 -; GFX9-NEXT: v_mov_b32_e32 v12, s26 -; GFX9-NEXT: v_mov_b32_e32 v13, s27 -; GFX9-NEXT: v_mov_b32_e32 v14, s6 -; GFX9-NEXT: v_mov_b32_e32 v15, s7 -; GFX9-NEXT: v_mov_b32_e32 v16, s8 -; GFX9-NEXT: v_mov_b32_e32 v17, s9 -; GFX9-NEXT: v_mov_b32_e32 v18, s10 -; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-NEXT: v_mov_b32_e32 v12, s16 +; GFX9-NEXT: v_mov_b32_e32 v13, s17 +; GFX9-NEXT: v_mov_b32_e32 v14, s18 +; GFX9-NEXT: v_mov_b32_e32 v15, s19 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v18, s22 +; GFX9-NEXT: v_mov_b32_e32 v19, s23 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB13_4: ; GFX9-NEXT: ; implicit-def: $sgpr63 @@ -3386,38 +3470,65 @@ define inreg <40 x i16> @bitcast_v20i32_to_v40i16_scalar(<20 x i32> inreg %a, i3 ; GFX9-NEXT: ; implicit-def: $sgpr42 ; GFX9-NEXT: ; implicit-def: $sgpr41 ; GFX9-NEXT: ; implicit-def: $sgpr40 -; GFX9-NEXT: ; implicit-def: $sgpr15 -; GFX9-NEXT: ; implicit-def: $sgpr14 -; GFX9-NEXT: ; implicit-def: $sgpr13 -; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr29 +; GFX9-NEXT: ; implicit-def: $sgpr28 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: s_branch .LBB13_2 ; ; GFX11-LABEL: bitcast_v20i32_to_v40i16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, s0 :: v_dual_mov_b32 v4, s1 +; GFX11-NEXT: v_dual_mov_b32 v5, s2 :: v_dual_mov_b32 v6, s3 +; GFX11-NEXT: v_dual_mov_b32 v7, s16 :: v_dual_mov_b32 v8, s17 +; GFX11-NEXT: v_dual_mov_b32 v9, s18 :: v_dual_mov_b32 v10, s19 +; GFX11-NEXT: v_dual_mov_b32 v11, s20 :: v_dual_mov_b32 v12, s21 +; GFX11-NEXT: v_dual_mov_b32 v13, s22 :: v_dual_mov_b32 v14, s23 +; GFX11-NEXT: v_dual_mov_b32 v15, s24 :: v_dual_mov_b32 v16, s25 +; GFX11-NEXT: v_dual_mov_b32 v17, s26 :: v_dual_mov_b32 v18, s27 +; GFX11-NEXT: v_dual_mov_b32 v19, s28 :: v_dual_mov_b32 v20, s29 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-NEXT: v_readfirstlane_b32 s5, v0 -; GFX11-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-NEXT: v_readfirstlane_b32 s2, v5 +; GFX11-NEXT: v_readfirstlane_b32 s3, v6 +; GFX11-NEXT: v_readfirstlane_b32 s4, v7 +; GFX11-NEXT: v_readfirstlane_b32 s5, v8 +; GFX11-NEXT: v_readfirstlane_b32 s6, v9 +; GFX11-NEXT: v_readfirstlane_b32 s7, v10 +; GFX11-NEXT: v_readfirstlane_b32 s8, v11 +; GFX11-NEXT: v_readfirstlane_b32 s9, v12 +; GFX11-NEXT: v_readfirstlane_b32 s10, v13 +; GFX11-NEXT: v_readfirstlane_b32 s11, v14 +; GFX11-NEXT: v_readfirstlane_b32 s12, v15 +; GFX11-NEXT: v_readfirstlane_b32 s13, v16 +; GFX11-NEXT: v_readfirstlane_b32 s14, v17 +; GFX11-NEXT: v_readfirstlane_b32 s15, v18 +; GFX11-NEXT: v_readfirstlane_b32 s16, v19 +; GFX11-NEXT: v_readfirstlane_b32 s17, v20 +; GFX11-NEXT: v_readfirstlane_b32 s19, v0 +; GFX11-NEXT: v_readfirstlane_b32 s18, v1 ; GFX11-NEXT: s_mov_b32 s58, 0 -; GFX11-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-NEXT: s_and_b32 s20, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB13_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: s_lshr_b32 s6, s4, 16 -; GFX11-NEXT: s_lshr_b32 s7, s5, 16 -; GFX11-NEXT: s_lshr_b32 s8, s29, 16 -; GFX11-NEXT: s_lshr_b32 s9, s28, 16 -; GFX11-NEXT: s_lshr_b32 s10, s27, 16 -; GFX11-NEXT: s_lshr_b32 s11, s26, 16 -; GFX11-NEXT: s_lshr_b32 s12, s25, 16 -; GFX11-NEXT: s_lshr_b32 s13, s24, 16 -; GFX11-NEXT: s_lshr_b32 s14, s23, 16 -; GFX11-NEXT: s_lshr_b32 s15, s22, 16 -; GFX11-NEXT: s_lshr_b32 s40, s21, 16 -; GFX11-NEXT: s_lshr_b32 s41, s20, 16 -; GFX11-NEXT: s_lshr_b32 s42, s19, 16 -; GFX11-NEXT: s_lshr_b32 s43, s18, 16 -; GFX11-NEXT: s_lshr_b32 s44, s17, 16 -; GFX11-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-NEXT: s_lshr_b32 s20, s18, 16 +; GFX11-NEXT: s_lshr_b32 s21, s19, 16 +; GFX11-NEXT: s_lshr_b32 s22, s17, 16 +; GFX11-NEXT: s_lshr_b32 s23, s16, 16 +; GFX11-NEXT: s_lshr_b32 s24, s15, 16 +; GFX11-NEXT: s_lshr_b32 s25, s14, 16 +; GFX11-NEXT: s_lshr_b32 s26, s13, 16 +; GFX11-NEXT: s_lshr_b32 s27, s12, 16 +; GFX11-NEXT: s_lshr_b32 s28, s11, 16 +; GFX11-NEXT: s_lshr_b32 s29, s10, 16 +; GFX11-NEXT: s_lshr_b32 s40, s9, 16 +; GFX11-NEXT: s_lshr_b32 s41, s8, 16 +; GFX11-NEXT: s_lshr_b32 s42, s7, 16 +; GFX11-NEXT: s_lshr_b32 s43, s6, 16 +; GFX11-NEXT: s_lshr_b32 s44, s5, 16 +; GFX11-NEXT: s_lshr_b32 s45, s4, 16 ; GFX11-NEXT: s_lshr_b32 s46, s3, 16 ; GFX11-NEXT: s_lshr_b32 s47, s2, 16 ; GFX11-NEXT: s_lshr_b32 s56, s1, 16 @@ -3425,42 +3536,42 @@ define inreg <40 x i16> @bitcast_v20i32_to_v40i16_scalar(<20 x i32> inreg %a, i3 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s58 ; GFX11-NEXT: s_cbranch_vccnz .LBB13_3 ; GFX11-NEXT: .LBB13_2: ; %cmp.true -; GFX11-NEXT: s_add_i32 s4, s4, 3 -; GFX11-NEXT: s_add_i32 s5, s5, 3 -; GFX11-NEXT: s_add_i32 s29, s29, 3 -; GFX11-NEXT: s_add_i32 s28, s28, 3 -; GFX11-NEXT: s_add_i32 s27, s27, 3 -; GFX11-NEXT: s_add_i32 s26, s26, 3 -; GFX11-NEXT: s_add_i32 s25, s25, 3 -; GFX11-NEXT: s_add_i32 s24, s24, 3 -; GFX11-NEXT: s_add_i32 s23, s23, 3 -; GFX11-NEXT: s_add_i32 s22, s22, 3 -; GFX11-NEXT: s_add_i32 s21, s21, 3 -; GFX11-NEXT: s_add_i32 s20, s20, 3 -; GFX11-NEXT: s_add_i32 s19, s19, 3 ; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 ; GFX11-NEXT: s_add_i32 s17, s17, 3 ; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s15, s15, 3 +; GFX11-NEXT: s_add_i32 s14, s14, 3 +; GFX11-NEXT: s_add_i32 s13, s13, 3 +; GFX11-NEXT: s_add_i32 s12, s12, 3 +; GFX11-NEXT: s_add_i32 s11, s11, 3 +; GFX11-NEXT: s_add_i32 s10, s10, 3 +; GFX11-NEXT: s_add_i32 s9, s9, 3 +; GFX11-NEXT: s_add_i32 s8, s8, 3 +; GFX11-NEXT: s_add_i32 s7, s7, 3 +; GFX11-NEXT: s_add_i32 s6, s6, 3 +; GFX11-NEXT: s_add_i32 s5, s5, 3 +; GFX11-NEXT: s_add_i32 s4, s4, 3 ; GFX11-NEXT: s_add_i32 s3, s3, 3 ; GFX11-NEXT: s_add_i32 s2, s2, 3 ; GFX11-NEXT: s_add_i32 s1, s1, 3 ; GFX11-NEXT: s_add_i32 s0, s0, 3 -; GFX11-NEXT: s_lshr_b32 s6, s4, 16 -; GFX11-NEXT: s_lshr_b32 s7, s5, 16 -; GFX11-NEXT: s_lshr_b32 s8, s29, 16 -; GFX11-NEXT: s_lshr_b32 s9, s28, 16 -; GFX11-NEXT: s_lshr_b32 s10, s27, 16 -; GFX11-NEXT: s_lshr_b32 s11, s26, 16 -; GFX11-NEXT: s_lshr_b32 s12, s25, 16 -; GFX11-NEXT: s_lshr_b32 s13, s24, 16 -; GFX11-NEXT: s_lshr_b32 s14, s23, 16 -; GFX11-NEXT: s_lshr_b32 s15, s22, 16 -; GFX11-NEXT: s_lshr_b32 s40, s21, 16 -; GFX11-NEXT: s_lshr_b32 s41, s20, 16 -; GFX11-NEXT: s_lshr_b32 s42, s19, 16 -; GFX11-NEXT: s_lshr_b32 s43, s18, 16 -; GFX11-NEXT: s_lshr_b32 s44, s17, 16 -; GFX11-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-NEXT: s_lshr_b32 s20, s18, 16 +; GFX11-NEXT: s_lshr_b32 s21, s19, 16 +; GFX11-NEXT: s_lshr_b32 s22, s17, 16 +; GFX11-NEXT: s_lshr_b32 s23, s16, 16 +; GFX11-NEXT: s_lshr_b32 s24, s15, 16 +; GFX11-NEXT: s_lshr_b32 s25, s14, 16 +; GFX11-NEXT: s_lshr_b32 s26, s13, 16 +; GFX11-NEXT: s_lshr_b32 s27, s12, 16 +; GFX11-NEXT: s_lshr_b32 s28, s11, 16 +; GFX11-NEXT: s_lshr_b32 s29, s10, 16 +; GFX11-NEXT: s_lshr_b32 s40, s9, 16 +; GFX11-NEXT: s_lshr_b32 s41, s8, 16 +; GFX11-NEXT: s_lshr_b32 s42, s7, 16 +; GFX11-NEXT: s_lshr_b32 s43, s6, 16 +; GFX11-NEXT: s_lshr_b32 s44, s5, 16 +; GFX11-NEXT: s_lshr_b32 s45, s4, 16 ; GFX11-NEXT: s_lshr_b32 s46, s3, 16 ; GFX11-NEXT: s_lshr_b32 s47, s2, 16 ; GFX11-NEXT: s_lshr_b32 s56, s1, 16 @@ -3471,32 +3582,32 @@ define inreg <40 x i16> @bitcast_v20i32_to_v40i16_scalar(<20 x i32> inreg %a, i3 ; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s56 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s47 ; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s46 -; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s45 -; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s44 -; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s43 -; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s42 -; GFX11-NEXT: s_pack_ll_b32_b16 s20, s20, s41 -; GFX11-NEXT: s_pack_ll_b32_b16 s21, s21, s40 -; GFX11-NEXT: s_pack_ll_b32_b16 s15, s22, s15 -; GFX11-NEXT: s_pack_ll_b32_b16 s14, s23, s14 -; GFX11-NEXT: s_pack_ll_b32_b16 s13, s24, s13 -; GFX11-NEXT: s_pack_ll_b32_b16 s12, s25, s12 -; GFX11-NEXT: s_pack_ll_b32_b16 s11, s26, s11 -; GFX11-NEXT: s_pack_ll_b32_b16 s10, s27, s10 -; GFX11-NEXT: s_pack_ll_b32_b16 s9, s28, s9 -; GFX11-NEXT: s_pack_ll_b32_b16 s8, s29, s8 -; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s7 -; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s45 +; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s44 +; GFX11-NEXT: s_pack_ll_b32_b16 s6, s6, s43 +; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s42 +; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s41 +; GFX11-NEXT: s_pack_ll_b32_b16 s9, s9, s40 +; GFX11-NEXT: s_pack_ll_b32_b16 s10, s10, s29 +; GFX11-NEXT: s_pack_ll_b32_b16 s11, s11, s28 +; GFX11-NEXT: s_pack_ll_b32_b16 s12, s12, s27 +; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s26 +; GFX11-NEXT: s_pack_ll_b32_b16 s14, s14, s25 +; GFX11-NEXT: s_pack_ll_b32_b16 s15, s15, s24 +; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s23 +; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s22 +; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s21 +; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s20 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 -; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 -; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 -; GFX11-NEXT: v_dual_mov_b32 v10, s15 :: v_dual_mov_b32 v11, s14 -; GFX11-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s12 -; GFX11-NEXT: v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v15, s10 -; GFX11-NEXT: v_dual_mov_b32 v16, s9 :: v_dual_mov_b32 v17, s8 -; GFX11-NEXT: v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v19, s4 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GFX11-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-NEXT: v_dual_mov_b32 v18, s19 :: v_dual_mov_b32 v19, s18 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB13_4: ; GFX11-NEXT: ; implicit-def: $sgpr57 @@ -3509,16 +3620,16 @@ define inreg <40 x i16> @bitcast_v20i32_to_v40i16_scalar(<20 x i32> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr41 ; GFX11-NEXT: ; implicit-def: $sgpr40 -; GFX11-NEXT: ; implicit-def: $sgpr15 -; GFX11-NEXT: ; implicit-def: $sgpr14 -; GFX11-NEXT: ; implicit-def: $sgpr13 -; GFX11-NEXT: ; implicit-def: $sgpr12 -; GFX11-NEXT: ; implicit-def: $sgpr11 -; GFX11-NEXT: ; implicit-def: $sgpr10 -; GFX11-NEXT: ; implicit-def: $sgpr9 -; GFX11-NEXT: ; implicit-def: $sgpr8 -; GFX11-NEXT: ; implicit-def: $sgpr7 -; GFX11-NEXT: ; implicit-def: $sgpr6 +; GFX11-NEXT: ; implicit-def: $sgpr29 +; GFX11-NEXT: ; implicit-def: $sgpr28 +; GFX11-NEXT: ; implicit-def: $sgpr27 +; GFX11-NEXT: ; implicit-def: $sgpr26 +; GFX11-NEXT: ; implicit-def: $sgpr25 +; GFX11-NEXT: ; implicit-def: $sgpr24 +; GFX11-NEXT: ; implicit-def: $sgpr23 +; GFX11-NEXT: ; implicit-def: $sgpr22 +; GFX11-NEXT: ; implicit-def: $sgpr21 +; GFX11-NEXT: ; implicit-def: $sgpr20 ; GFX11-NEXT: s_branch .LBB13_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -3823,7 +3934,7 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v19 ; SI-NEXT: .LBB14_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload @@ -6094,7 +6205,35 @@ define inreg <40 x half> @bitcast_v20i32_to_v40f16_scalar(<20 x i32> inreg %a, i ; SI-LABEL: bitcast_v20i32_to_v40f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, s16 +; SI-NEXT: v_mov_b32_e32 v9, s17 +; SI-NEXT: v_mov_b32_e32 v10, s18 +; SI-NEXT: v_mov_b32_e32 v11, s19 +; SI-NEXT: v_mov_b32_e32 v12, s20 +; SI-NEXT: v_mov_b32_e32 v13, s21 +; SI-NEXT: v_mov_b32_e32 v14, s22 +; SI-NEXT: v_mov_b32_e32 v15, s23 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v17, s25 +; SI-NEXT: v_mov_b32_e32 v18, s26 +; SI-NEXT: v_mov_b32_e32 v19, s27 +; SI-NEXT: v_readfirstlane_b32 s23, v8 +; SI-NEXT: v_mov_b32_e32 v8, s28 +; SI-NEXT: v_readfirstlane_b32 s24, v9 +; SI-NEXT: v_mov_b32_e32 v9, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: v_readfirstlane_b32 s25, v10 +; SI-NEXT: v_readfirstlane_b32 s22, v11 +; SI-NEXT: v_readfirstlane_b32 s21, v12 +; SI-NEXT: v_readfirstlane_b32 s20, v13 +; SI-NEXT: v_readfirstlane_b32 s19, v14 +; SI-NEXT: v_readfirstlane_b32 s18, v15 +; SI-NEXT: v_readfirstlane_b32 s17, v16 +; SI-NEXT: v_readfirstlane_b32 s16, v17 +; SI-NEXT: v_readfirstlane_b32 s15, v18 +; SI-NEXT: v_readfirstlane_b32 s14, v19 +; SI-NEXT: v_readfirstlane_b32 s13, v8 +; SI-NEXT: v_readfirstlane_b32 s12, v9 ; SI-NEXT: v_readfirstlane_b32 s11, v1 ; SI-NEXT: v_readfirstlane_b32 s10, v2 ; SI-NEXT: v_readfirstlane_b32 s8, v3 @@ -6116,33 +6255,33 @@ define inreg <40 x half> @bitcast_v20i32_to_v40f16_scalar(<20 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_lshr_b32 s4, s11, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s4, s12, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: s_lshr_b32 s4, s13, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: s_lshr_b32 s4, s14, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: s_lshr_b32 s4, s15, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: s_lshr_b32 s4, s17, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: s_lshr_b32 s4, s18, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: s_lshr_b32 s4, s19, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: s_lshr_b32 s4, s22, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: s_lshr_b32 s4, s25, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: s_lshr_b32 s4, s24, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_lshr_b32 s4, s23, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 @@ -6150,56 +6289,56 @@ define inreg <40 x half> @bitcast_v20i32_to_v40f16_scalar(<20 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s23 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_add_i32 s10, s10, 3 ; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: s_lshr_b32 s5, s17, 16 -; SI-NEXT: s_lshr_b32 s12, s18, 16 -; SI-NEXT: s_lshr_b32 s13, s19, 16 -; SI-NEXT: s_lshr_b32 s14, s20, 16 -; SI-NEXT: s_lshr_b32 s15, s21, 16 -; SI-NEXT: s_lshr_b32 s40, s22, 16 -; SI-NEXT: s_lshr_b32 s41, s23, 16 -; SI-NEXT: s_lshr_b32 s42, s24, 16 -; SI-NEXT: s_lshr_b32 s43, s25, 16 -; SI-NEXT: s_lshr_b32 s44, s26, 16 -; SI-NEXT: s_lshr_b32 s45, s27, 16 -; SI-NEXT: s_lshr_b32 s46, s28, 16 -; SI-NEXT: s_lshr_b32 s47, s29, 16 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: s_lshr_b32 s5, s24, 16 +; SI-NEXT: s_lshr_b32 s26, s25, 16 +; SI-NEXT: s_lshr_b32 s27, s22, 16 +; SI-NEXT: s_lshr_b32 s28, s21, 16 +; SI-NEXT: s_lshr_b32 s29, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: s_lshr_b32 s44, s15, 16 +; SI-NEXT: s_lshr_b32 s45, s14, 16 +; SI-NEXT: s_lshr_b32 s46, s13, 16 +; SI-NEXT: s_lshr_b32 s47, s12, 16 ; SI-NEXT: s_lshr_b32 s56, s11, 16 ; SI-NEXT: s_lshr_b32 s57, s10, 16 ; SI-NEXT: s_lshr_b32 s58, s8, 16 @@ -6212,20 +6351,20 @@ define inreg <40 x half> @bitcast_v20i32_to_v40f16_scalar(<20 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s23 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s61 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s60 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s59 @@ -6240,10 +6379,10 @@ define inreg <40 x half> @bitcast_v20i32_to_v40f16_scalar(<20 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v23, s42 ; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 ; SI-NEXT: v_cvt_f32_f16_e32 v27, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s26 ; SI-NEXT: v_cvt_f32_f16_e32 v38, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 ; SI-NEXT: .LBB17_3: ; %end @@ -6433,7 +6572,35 @@ define inreg <40 x half> @bitcast_v20i32_to_v40f16_scalar(<20 x i32> inreg %a, i ; VI-LABEL: bitcast_v20i32_to_v40f16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, s16 +; VI-NEXT: v_mov_b32_e32 v8, s17 +; VI-NEXT: v_mov_b32_e32 v9, s18 +; VI-NEXT: v_mov_b32_e32 v10, s19 +; VI-NEXT: v_mov_b32_e32 v11, s20 +; VI-NEXT: v_mov_b32_e32 v12, s21 +; VI-NEXT: v_mov_b32_e32 v13, s22 +; VI-NEXT: v_mov_b32_e32 v14, s23 +; VI-NEXT: v_mov_b32_e32 v15, s24 +; VI-NEXT: v_mov_b32_e32 v16, s25 +; VI-NEXT: v_mov_b32_e32 v17, s26 +; VI-NEXT: v_mov_b32_e32 v18, s27 +; VI-NEXT: v_mov_b32_e32 v19, s28 +; VI-NEXT: v_readfirstlane_b32 s25, v7 +; VI-NEXT: v_mov_b32_e32 v7, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_readfirstlane_b32 s24, v8 +; VI-NEXT: v_readfirstlane_b32 s23, v9 +; VI-NEXT: v_readfirstlane_b32 s22, v10 +; VI-NEXT: v_readfirstlane_b32 s21, v11 +; VI-NEXT: v_readfirstlane_b32 s20, v12 +; VI-NEXT: v_readfirstlane_b32 s19, v13 +; VI-NEXT: v_readfirstlane_b32 s18, v14 +; VI-NEXT: v_readfirstlane_b32 s17, v15 +; VI-NEXT: v_readfirstlane_b32 s16, v16 +; VI-NEXT: v_readfirstlane_b32 s15, v17 +; VI-NEXT: v_readfirstlane_b32 s14, v18 +; VI-NEXT: v_readfirstlane_b32 s13, v19 +; VI-NEXT: v_readfirstlane_b32 s12, v7 ; VI-NEXT: v_readfirstlane_b32 s11, v0 ; VI-NEXT: v_readfirstlane_b32 s10, v1 ; VI-NEXT: v_readfirstlane_b32 s9, v2 @@ -6443,26 +6610,26 @@ define inreg <40 x half> @bitcast_v20i32_to_v40f16_scalar(<20 x i32> inreg %a, i ; VI-NEXT: v_readfirstlane_b32 s7, v5 ; VI-NEXT: s_cbranch_scc0 .LBB17_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s12, s7, 16 -; VI-NEXT: s_lshr_b32 s13, s6, 16 -; VI-NEXT: s_lshr_b32 s14, s8, 16 -; VI-NEXT: s_lshr_b32 s15, s9, 16 +; VI-NEXT: s_lshr_b32 s26, s7, 16 +; VI-NEXT: s_lshr_b32 s27, s6, 16 +; VI-NEXT: s_lshr_b32 s28, s8, 16 +; VI-NEXT: s_lshr_b32 s29, s9, 16 ; VI-NEXT: s_lshr_b32 s40, s10, 16 ; VI-NEXT: s_lshr_b32 s41, s11, 16 -; VI-NEXT: s_lshr_b32 s42, s29, 16 -; VI-NEXT: s_lshr_b32 s43, s28, 16 -; VI-NEXT: s_lshr_b32 s44, s27, 16 -; VI-NEXT: s_lshr_b32 s45, s26, 16 -; VI-NEXT: s_lshr_b32 s46, s25, 16 -; VI-NEXT: s_lshr_b32 s47, s24, 16 -; VI-NEXT: s_lshr_b32 s56, s23, 16 -; VI-NEXT: s_lshr_b32 s57, s22, 16 -; VI-NEXT: s_lshr_b32 s58, s21, 16 -; VI-NEXT: s_lshr_b32 s59, s20, 16 -; VI-NEXT: s_lshr_b32 s60, s19, 16 -; VI-NEXT: s_lshr_b32 s61, s18, 16 -; VI-NEXT: s_lshr_b32 s62, s17, 16 -; VI-NEXT: s_lshr_b32 s63, s16, 16 +; VI-NEXT: s_lshr_b32 s42, s12, 16 +; VI-NEXT: s_lshr_b32 s43, s13, 16 +; VI-NEXT: s_lshr_b32 s44, s14, 16 +; VI-NEXT: s_lshr_b32 s45, s15, 16 +; VI-NEXT: s_lshr_b32 s46, s16, 16 +; VI-NEXT: s_lshr_b32 s47, s17, 16 +; VI-NEXT: s_lshr_b32 s56, s18, 16 +; VI-NEXT: s_lshr_b32 s57, s19, 16 +; VI-NEXT: s_lshr_b32 s58, s20, 16 +; VI-NEXT: s_lshr_b32 s59, s21, 16 +; VI-NEXT: s_lshr_b32 s60, s22, 16 +; VI-NEXT: s_lshr_b32 s61, s23, 16 +; VI-NEXT: s_lshr_b32 s62, s24, 16 +; VI-NEXT: s_lshr_b32 s63, s25, 16 ; VI-NEXT: s_cbranch_execnz .LBB17_3 ; VI-NEXT: .LBB17_2: ; %cmp.true ; VI-NEXT: s_add_i32 s7, s7, 3 @@ -6471,115 +6638,115 @@ define inreg <40 x half> @bitcast_v20i32_to_v40f16_scalar(<20 x i32> inreg %a, i ; VI-NEXT: s_add_i32 s9, s9, 3 ; VI-NEXT: s_add_i32 s10, s10, 3 ; VI-NEXT: s_add_i32 s11, s11, 3 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_lshr_b32 s12, s7, 16 -; VI-NEXT: s_lshr_b32 s13, s6, 16 -; VI-NEXT: s_lshr_b32 s14, s8, 16 -; VI-NEXT: s_lshr_b32 s15, s9, 16 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_lshr_b32 s26, s7, 16 +; VI-NEXT: s_lshr_b32 s27, s6, 16 +; VI-NEXT: s_lshr_b32 s28, s8, 16 +; VI-NEXT: s_lshr_b32 s29, s9, 16 ; VI-NEXT: s_lshr_b32 s40, s10, 16 ; VI-NEXT: s_lshr_b32 s41, s11, 16 -; VI-NEXT: s_lshr_b32 s42, s29, 16 -; VI-NEXT: s_lshr_b32 s43, s28, 16 -; VI-NEXT: s_lshr_b32 s44, s27, 16 -; VI-NEXT: s_lshr_b32 s45, s26, 16 -; VI-NEXT: s_lshr_b32 s46, s25, 16 -; VI-NEXT: s_lshr_b32 s47, s24, 16 -; VI-NEXT: s_lshr_b32 s56, s23, 16 -; VI-NEXT: s_lshr_b32 s57, s22, 16 -; VI-NEXT: s_lshr_b32 s58, s21, 16 -; VI-NEXT: s_lshr_b32 s59, s20, 16 -; VI-NEXT: s_lshr_b32 s60, s19, 16 -; VI-NEXT: s_lshr_b32 s61, s18, 16 -; VI-NEXT: s_lshr_b32 s62, s17, 16 -; VI-NEXT: s_lshr_b32 s63, s16, 16 +; VI-NEXT: s_lshr_b32 s42, s12, 16 +; VI-NEXT: s_lshr_b32 s43, s13, 16 +; VI-NEXT: s_lshr_b32 s44, s14, 16 +; VI-NEXT: s_lshr_b32 s45, s15, 16 +; VI-NEXT: s_lshr_b32 s46, s16, 16 +; VI-NEXT: s_lshr_b32 s47, s17, 16 +; VI-NEXT: s_lshr_b32 s56, s18, 16 +; VI-NEXT: s_lshr_b32 s57, s19, 16 +; VI-NEXT: s_lshr_b32 s58, s20, 16 +; VI-NEXT: s_lshr_b32 s59, s21, 16 +; VI-NEXT: s_lshr_b32 s60, s22, 16 +; VI-NEXT: s_lshr_b32 s61, s23, 16 +; VI-NEXT: s_lshr_b32 s62, s24, 16 +; VI-NEXT: s_lshr_b32 s63, s25, 16 ; VI-NEXT: .LBB17_3: ; %end -; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_and_b32 s4, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s5, s63, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s16, s62, 16 -; VI-NEXT: s_or_b32 s5, s5, s16 -; VI-NEXT: s_and_b32 s16, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s17, s61, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_and_b32 s17, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s18, s60, 16 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s18, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s19, s59, 16 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_and_b32 s19, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s20, s58, 16 -; VI-NEXT: s_or_b32 s19, s19, s20 -; VI-NEXT: s_and_b32 s20, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s21, s57, 16 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_and_b32 s21, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s22, s56, 16 -; VI-NEXT: s_or_b32 s21, s21, s22 -; VI-NEXT: s_and_b32 s22, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s23, s47, 16 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_and_b32 s23, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s24, s46, 16 +; VI-NEXT: s_and_b32 s5, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s24, s62, 16 +; VI-NEXT: s_or_b32 s5, s5, s24 +; VI-NEXT: s_and_b32 s23, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s24, s61, 16 ; VI-NEXT: s_or_b32 s23, s23, s24 -; VI-NEXT: s_and_b32 s24, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s25, s45, 16 -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_and_b32 s25, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s26, s44, 16 -; VI-NEXT: s_or_b32 s25, s25, s26 -; VI-NEXT: s_and_b32 s26, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s27, s43, 16 -; VI-NEXT: s_or_b32 s26, s26, s27 -; VI-NEXT: s_and_b32 s27, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s28, s42, 16 -; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s22, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s24, s60, 16 +; VI-NEXT: s_or_b32 s22, s22, s24 +; VI-NEXT: s_and_b32 s21, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s24, s59, 16 +; VI-NEXT: s_or_b32 s21, s21, s24 +; VI-NEXT: s_and_b32 s20, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s24, s58, 16 +; VI-NEXT: s_or_b32 s20, s20, s24 +; VI-NEXT: s_and_b32 s19, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s24, s57, 16 +; VI-NEXT: s_or_b32 s19, s19, s24 +; VI-NEXT: s_and_b32 s18, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s24, s56, 16 +; VI-NEXT: s_or_b32 s18, s18, s24 +; VI-NEXT: s_and_b32 s17, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s24, s47, 16 +; VI-NEXT: s_or_b32 s17, s17, s24 +; VI-NEXT: s_and_b32 s16, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s24, s46, 16 +; VI-NEXT: s_or_b32 s16, s16, s24 +; VI-NEXT: s_and_b32 s15, 0xffff, s15 +; VI-NEXT: s_lshl_b32 s24, s45, 16 +; VI-NEXT: s_or_b32 s15, s15, s24 +; VI-NEXT: s_and_b32 s14, 0xffff, s14 +; VI-NEXT: s_lshl_b32 s24, s44, 16 +; VI-NEXT: s_or_b32 s14, s14, s24 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s24, s43, 16 +; VI-NEXT: s_or_b32 s13, s13, s24 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s24, s42, 16 +; VI-NEXT: s_or_b32 s12, s12, s24 ; VI-NEXT: s_and_b32 s11, 0xffff, s11 -; VI-NEXT: s_lshl_b32 s28, s41, 16 -; VI-NEXT: s_or_b32 s11, s11, s28 +; VI-NEXT: s_lshl_b32 s24, s41, 16 +; VI-NEXT: s_or_b32 s11, s11, s24 ; VI-NEXT: s_and_b32 s10, 0xffff, s10 -; VI-NEXT: s_lshl_b32 s28, s40, 16 +; VI-NEXT: s_lshl_b32 s24, s40, 16 +; VI-NEXT: s_or_b32 s10, s10, s24 ; VI-NEXT: s_and_b32 s9, 0xffff, s9 -; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_lshl_b32 s24, s29, 16 +; VI-NEXT: s_or_b32 s9, s9, s24 ; VI-NEXT: s_and_b32 s8, 0xffff, s8 -; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_lshl_b32 s24, s28, 16 +; VI-NEXT: s_or_b32 s8, s8, s24 ; VI-NEXT: s_and_b32 s6, 0xffff, s6 -; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_lshl_b32 s24, s27, 16 +; VI-NEXT: s_or_b32 s6, s6, s24 ; VI-NEXT: s_and_b32 s7, 0xffff, s7 -; VI-NEXT: s_lshl_b32 s12, s12, 16 -; VI-NEXT: s_or_b32 s10, s10, s28 -; VI-NEXT: s_or_b32 s9, s9, s15 -; VI-NEXT: s_or_b32 s8, s8, s14 -; VI-NEXT: s_or_b32 s6, s6, s13 -; VI-NEXT: s_or_b32 s7, s7, s12 +; VI-NEXT: s_lshl_b32 s24, s26, 16 +; VI-NEXT: s_or_b32 s7, s7, s24 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: v_mov_b32_e32 v3, s17 -; VI-NEXT: v_mov_b32_e32 v4, s18 -; VI-NEXT: v_mov_b32_e32 v5, s19 -; VI-NEXT: v_mov_b32_e32 v6, s20 -; VI-NEXT: v_mov_b32_e32 v7, s21 -; VI-NEXT: v_mov_b32_e32 v8, s22 -; VI-NEXT: v_mov_b32_e32 v9, s23 -; VI-NEXT: v_mov_b32_e32 v10, s24 -; VI-NEXT: v_mov_b32_e32 v11, s25 -; VI-NEXT: v_mov_b32_e32 v12, s26 -; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v2, s23 +; VI-NEXT: v_mov_b32_e32 v3, s22 +; VI-NEXT: v_mov_b32_e32 v4, s21 +; VI-NEXT: v_mov_b32_e32 v5, s20 +; VI-NEXT: v_mov_b32_e32 v6, s19 +; VI-NEXT: v_mov_b32_e32 v7, s18 +; VI-NEXT: v_mov_b32_e32 v8, s17 +; VI-NEXT: v_mov_b32_e32 v9, s16 +; VI-NEXT: v_mov_b32_e32 v10, s15 +; VI-NEXT: v_mov_b32_e32 v11, s14 +; VI-NEXT: v_mov_b32_e32 v12, s13 +; VI-NEXT: v_mov_b32_e32 v13, s12 ; VI-NEXT: v_mov_b32_e32 v14, s11 ; VI-NEXT: v_mov_b32_e32 v15, s10 ; VI-NEXT: v_mov_b32_e32 v16, s9 @@ -6604,57 +6771,75 @@ define inreg <40 x half> @bitcast_v20i32_to_v40f16_scalar(<20 x i32> inreg %a, i ; VI-NEXT: ; implicit-def: $sgpr42 ; VI-NEXT: ; implicit-def: $sgpr41 ; VI-NEXT: ; implicit-def: $sgpr40 -; VI-NEXT: ; implicit-def: $sgpr15 -; VI-NEXT: ; implicit-def: $sgpr14 -; VI-NEXT: ; implicit-def: $sgpr13 -; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: s_branch .LBB17_2 ; ; GFX9-LABEL: bitcast_v20i32_to_v40f16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, s16 +; GFX9-NEXT: v_mov_b32_e32 v8, s17 +; GFX9-NEXT: v_mov_b32_e32 v9, s18 +; GFX9-NEXT: v_mov_b32_e32 v10, s19 +; GFX9-NEXT: v_mov_b32_e32 v11, s20 +; GFX9-NEXT: v_mov_b32_e32 v12, s21 +; GFX9-NEXT: v_mov_b32_e32 v13, s22 +; GFX9-NEXT: v_mov_b32_e32 v14, s23 +; GFX9-NEXT: v_mov_b32_e32 v15, s24 +; GFX9-NEXT: v_mov_b32_e32 v16, s25 +; GFX9-NEXT: v_mov_b32_e32 v17, s26 +; GFX9-NEXT: v_mov_b32_e32 v18, s27 +; GFX9-NEXT: v_mov_b32_e32 v19, s28 +; GFX9-NEXT: v_readfirstlane_b32 s6, v7 +; GFX9-NEXT: v_mov_b32_e32 v7, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: v_readfirstlane_b32 s7, v1 -; GFX9-NEXT: v_readfirstlane_b32 s8, v2 -; GFX9-NEXT: v_readfirstlane_b32 s9, v3 -; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: v_readfirstlane_b32 s7, v8 +; GFX9-NEXT: v_readfirstlane_b32 s8, v9 +; GFX9-NEXT: v_readfirstlane_b32 s9, v10 +; GFX9-NEXT: v_readfirstlane_b32 s10, v11 +; GFX9-NEXT: v_readfirstlane_b32 s11, v12 +; GFX9-NEXT: v_readfirstlane_b32 s12, v13 +; GFX9-NEXT: v_readfirstlane_b32 s13, v14 +; GFX9-NEXT: v_readfirstlane_b32 s14, v15 +; GFX9-NEXT: v_readfirstlane_b32 s15, v16 +; GFX9-NEXT: v_readfirstlane_b32 s16, v17 +; GFX9-NEXT: v_readfirstlane_b32 s17, v18 +; GFX9-NEXT: v_readfirstlane_b32 s18, v19 +; GFX9-NEXT: v_readfirstlane_b32 s19, v7 +; GFX9-NEXT: v_readfirstlane_b32 s20, v0 +; GFX9-NEXT: v_readfirstlane_b32 s21, v1 +; GFX9-NEXT: v_readfirstlane_b32 s22, v2 +; GFX9-NEXT: v_readfirstlane_b32 s23, v3 +; GFX9-NEXT: v_readfirstlane_b32 s24, v4 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: v_readfirstlane_b32 s25, v5 ; GFX9-NEXT: s_cbranch_scc0 .LBB17_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_lshr_b32 s12, s11, 16 -; GFX9-NEXT: s_lshr_b32 s13, s10, 16 -; GFX9-NEXT: s_lshr_b32 s14, s9, 16 -; GFX9-NEXT: s_lshr_b32 s15, s8, 16 -; GFX9-NEXT: s_lshr_b32 s40, s7, 16 -; GFX9-NEXT: s_lshr_b32 s41, s6, 16 -; GFX9-NEXT: s_lshr_b32 s42, s29, 16 -; GFX9-NEXT: s_lshr_b32 s43, s28, 16 -; GFX9-NEXT: s_lshr_b32 s44, s27, 16 -; GFX9-NEXT: s_lshr_b32 s45, s26, 16 -; GFX9-NEXT: s_lshr_b32 s46, s25, 16 -; GFX9-NEXT: s_lshr_b32 s47, s24, 16 -; GFX9-NEXT: s_lshr_b32 s56, s23, 16 -; GFX9-NEXT: s_lshr_b32 s57, s22, 16 -; GFX9-NEXT: s_lshr_b32 s58, s21, 16 -; GFX9-NEXT: s_lshr_b32 s59, s20, 16 -; GFX9-NEXT: s_lshr_b32 s60, s19, 16 -; GFX9-NEXT: s_lshr_b32 s61, s18, 16 -; GFX9-NEXT: s_lshr_b32 s62, s17, 16 -; GFX9-NEXT: s_lshr_b32 s63, s16, 16 +; GFX9-NEXT: s_lshr_b32 s26, s25, 16 +; GFX9-NEXT: s_lshr_b32 s27, s24, 16 +; GFX9-NEXT: s_lshr_b32 s28, s23, 16 +; GFX9-NEXT: s_lshr_b32 s29, s22, 16 +; GFX9-NEXT: s_lshr_b32 s40, s21, 16 +; GFX9-NEXT: s_lshr_b32 s41, s20, 16 +; GFX9-NEXT: s_lshr_b32 s42, s19, 16 +; GFX9-NEXT: s_lshr_b32 s43, s18, 16 +; GFX9-NEXT: s_lshr_b32 s44, s17, 16 +; GFX9-NEXT: s_lshr_b32 s45, s16, 16 +; GFX9-NEXT: s_lshr_b32 s46, s15, 16 +; GFX9-NEXT: s_lshr_b32 s47, s14, 16 +; GFX9-NEXT: s_lshr_b32 s56, s13, 16 +; GFX9-NEXT: s_lshr_b32 s57, s12, 16 +; GFX9-NEXT: s_lshr_b32 s58, s11, 16 +; GFX9-NEXT: s_lshr_b32 s59, s10, 16 +; GFX9-NEXT: s_lshr_b32 s60, s9, 16 +; GFX9-NEXT: s_lshr_b32 s61, s8, 16 +; GFX9-NEXT: s_lshr_b32 s62, s7, 16 +; GFX9-NEXT: s_lshr_b32 s63, s6, 16 ; GFX9-NEXT: s_cbranch_execnz .LBB17_3 ; GFX9-NEXT: .LBB17_2: ; %cmp.true -; GFX9-NEXT: s_add_i32 s11, s11, 3 -; GFX9-NEXT: s_add_i32 s10, s10, 3 -; GFX9-NEXT: s_add_i32 s9, s9, 3 -; GFX9-NEXT: s_add_i32 s8, s8, 3 -; GFX9-NEXT: s_add_i32 s7, s7, 3 -; GFX9-NEXT: s_add_i32 s6, s6, 3 -; GFX9-NEXT: s_add_i32 s29, s29, 3 -; GFX9-NEXT: s_add_i32 s28, s28, 3 -; GFX9-NEXT: s_add_i32 s27, s27, 3 -; GFX9-NEXT: s_add_i32 s26, s26, 3 ; GFX9-NEXT: s_add_i32 s25, s25, 3 ; GFX9-NEXT: s_add_i32 s24, s24, 3 ; GFX9-NEXT: s_add_i32 s23, s23, 3 @@ -6665,67 +6850,77 @@ define inreg <40 x half> @bitcast_v20i32_to_v40f16_scalar(<20 x i32> inreg %a, i ; GFX9-NEXT: s_add_i32 s18, s18, 3 ; GFX9-NEXT: s_add_i32 s17, s17, 3 ; GFX9-NEXT: s_add_i32 s16, s16, 3 -; GFX9-NEXT: s_lshr_b32 s12, s11, 16 -; GFX9-NEXT: s_lshr_b32 s13, s10, 16 -; GFX9-NEXT: s_lshr_b32 s14, s9, 16 -; GFX9-NEXT: s_lshr_b32 s15, s8, 16 -; GFX9-NEXT: s_lshr_b32 s40, s7, 16 -; GFX9-NEXT: s_lshr_b32 s41, s6, 16 -; GFX9-NEXT: s_lshr_b32 s42, s29, 16 -; GFX9-NEXT: s_lshr_b32 s43, s28, 16 -; GFX9-NEXT: s_lshr_b32 s44, s27, 16 -; GFX9-NEXT: s_lshr_b32 s45, s26, 16 -; GFX9-NEXT: s_lshr_b32 s46, s25, 16 -; GFX9-NEXT: s_lshr_b32 s47, s24, 16 -; GFX9-NEXT: s_lshr_b32 s56, s23, 16 -; GFX9-NEXT: s_lshr_b32 s57, s22, 16 -; GFX9-NEXT: s_lshr_b32 s58, s21, 16 -; GFX9-NEXT: s_lshr_b32 s59, s20, 16 -; GFX9-NEXT: s_lshr_b32 s60, s19, 16 -; GFX9-NEXT: s_lshr_b32 s61, s18, 16 -; GFX9-NEXT: s_lshr_b32 s62, s17, 16 -; GFX9-NEXT: s_lshr_b32 s63, s16, 16 +; GFX9-NEXT: s_add_i32 s15, s15, 3 +; GFX9-NEXT: s_add_i32 s14, s14, 3 +; GFX9-NEXT: s_add_i32 s13, s13, 3 +; GFX9-NEXT: s_add_i32 s12, s12, 3 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_add_i32 s10, s10, 3 +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: s_add_i32 s6, s6, 3 +; GFX9-NEXT: s_lshr_b32 s26, s25, 16 +; GFX9-NEXT: s_lshr_b32 s27, s24, 16 +; GFX9-NEXT: s_lshr_b32 s28, s23, 16 +; GFX9-NEXT: s_lshr_b32 s29, s22, 16 +; GFX9-NEXT: s_lshr_b32 s40, s21, 16 +; GFX9-NEXT: s_lshr_b32 s41, s20, 16 +; GFX9-NEXT: s_lshr_b32 s42, s19, 16 +; GFX9-NEXT: s_lshr_b32 s43, s18, 16 +; GFX9-NEXT: s_lshr_b32 s44, s17, 16 +; GFX9-NEXT: s_lshr_b32 s45, s16, 16 +; GFX9-NEXT: s_lshr_b32 s46, s15, 16 +; GFX9-NEXT: s_lshr_b32 s47, s14, 16 +; GFX9-NEXT: s_lshr_b32 s56, s13, 16 +; GFX9-NEXT: s_lshr_b32 s57, s12, 16 +; GFX9-NEXT: s_lshr_b32 s58, s11, 16 +; GFX9-NEXT: s_lshr_b32 s59, s10, 16 +; GFX9-NEXT: s_lshr_b32 s60, s9, 16 +; GFX9-NEXT: s_lshr_b32 s61, s8, 16 +; GFX9-NEXT: s_lshr_b32 s62, s7, 16 +; GFX9-NEXT: s_lshr_b32 s63, s6, 16 ; GFX9-NEXT: .LBB17_3: ; %end -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s63 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s62 -; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s61 -; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s60 -; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s59 -; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s58 -; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s57 -; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s56 -; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s47 -; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s46 -; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s45 -; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s44 -; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s43 -; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s42 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s41 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s40 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s15 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s14 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s13 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s7, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s9, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s10, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s11, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s12, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s13, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s14, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s15, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s16, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s17, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s40 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s29 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s28 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s27 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s26 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s18 -; GFX9-NEXT: v_mov_b32_e32 v5, s19 -; GFX9-NEXT: v_mov_b32_e32 v6, s20 -; GFX9-NEXT: v_mov_b32_e32 v7, s21 -; GFX9-NEXT: v_mov_b32_e32 v8, s22 -; GFX9-NEXT: v_mov_b32_e32 v9, s23 -; GFX9-NEXT: v_mov_b32_e32 v10, s24 -; GFX9-NEXT: v_mov_b32_e32 v11, s25 -; GFX9-NEXT: v_mov_b32_e32 v12, s26 -; GFX9-NEXT: v_mov_b32_e32 v13, s27 -; GFX9-NEXT: v_mov_b32_e32 v14, s6 -; GFX9-NEXT: v_mov_b32_e32 v15, s7 -; GFX9-NEXT: v_mov_b32_e32 v16, s8 -; GFX9-NEXT: v_mov_b32_e32 v17, s9 -; GFX9-NEXT: v_mov_b32_e32 v18, s10 -; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-NEXT: v_mov_b32_e32 v12, s16 +; GFX9-NEXT: v_mov_b32_e32 v13, s17 +; GFX9-NEXT: v_mov_b32_e32 v14, s18 +; GFX9-NEXT: v_mov_b32_e32 v15, s19 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v18, s22 +; GFX9-NEXT: v_mov_b32_e32 v19, s23 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB17_4: ; GFX9-NEXT: ; implicit-def: $sgpr63 @@ -6744,38 +6939,65 @@ define inreg <40 x half> @bitcast_v20i32_to_v40f16_scalar(<20 x i32> inreg %a, i ; GFX9-NEXT: ; implicit-def: $sgpr42 ; GFX9-NEXT: ; implicit-def: $sgpr41 ; GFX9-NEXT: ; implicit-def: $sgpr40 -; GFX9-NEXT: ; implicit-def: $sgpr15 -; GFX9-NEXT: ; implicit-def: $sgpr14 -; GFX9-NEXT: ; implicit-def: $sgpr13 -; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr29 +; GFX9-NEXT: ; implicit-def: $sgpr28 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: s_branch .LBB17_2 ; ; GFX11-LABEL: bitcast_v20i32_to_v40f16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, s0 :: v_dual_mov_b32 v4, s1 +; GFX11-NEXT: v_dual_mov_b32 v5, s2 :: v_dual_mov_b32 v6, s3 +; GFX11-NEXT: v_dual_mov_b32 v7, s16 :: v_dual_mov_b32 v8, s17 +; GFX11-NEXT: v_dual_mov_b32 v9, s18 :: v_dual_mov_b32 v10, s19 +; GFX11-NEXT: v_dual_mov_b32 v11, s20 :: v_dual_mov_b32 v12, s21 +; GFX11-NEXT: v_dual_mov_b32 v13, s22 :: v_dual_mov_b32 v14, s23 +; GFX11-NEXT: v_dual_mov_b32 v15, s24 :: v_dual_mov_b32 v16, s25 +; GFX11-NEXT: v_dual_mov_b32 v17, s26 :: v_dual_mov_b32 v18, s27 +; GFX11-NEXT: v_dual_mov_b32 v19, s28 :: v_dual_mov_b32 v20, s29 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-NEXT: v_readfirstlane_b32 s5, v0 -; GFX11-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-NEXT: v_readfirstlane_b32 s2, v5 +; GFX11-NEXT: v_readfirstlane_b32 s3, v6 +; GFX11-NEXT: v_readfirstlane_b32 s4, v7 +; GFX11-NEXT: v_readfirstlane_b32 s5, v8 +; GFX11-NEXT: v_readfirstlane_b32 s6, v9 +; GFX11-NEXT: v_readfirstlane_b32 s7, v10 +; GFX11-NEXT: v_readfirstlane_b32 s8, v11 +; GFX11-NEXT: v_readfirstlane_b32 s9, v12 +; GFX11-NEXT: v_readfirstlane_b32 s10, v13 +; GFX11-NEXT: v_readfirstlane_b32 s11, v14 +; GFX11-NEXT: v_readfirstlane_b32 s12, v15 +; GFX11-NEXT: v_readfirstlane_b32 s13, v16 +; GFX11-NEXT: v_readfirstlane_b32 s14, v17 +; GFX11-NEXT: v_readfirstlane_b32 s15, v18 +; GFX11-NEXT: v_readfirstlane_b32 s16, v19 +; GFX11-NEXT: v_readfirstlane_b32 s17, v20 +; GFX11-NEXT: v_readfirstlane_b32 s19, v0 +; GFX11-NEXT: v_readfirstlane_b32 s18, v1 ; GFX11-NEXT: s_mov_b32 s58, 0 -; GFX11-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-NEXT: s_and_b32 s20, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB17_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: s_lshr_b32 s6, s4, 16 -; GFX11-NEXT: s_lshr_b32 s7, s5, 16 -; GFX11-NEXT: s_lshr_b32 s8, s29, 16 -; GFX11-NEXT: s_lshr_b32 s9, s28, 16 -; GFX11-NEXT: s_lshr_b32 s10, s27, 16 -; GFX11-NEXT: s_lshr_b32 s11, s26, 16 -; GFX11-NEXT: s_lshr_b32 s12, s25, 16 -; GFX11-NEXT: s_lshr_b32 s13, s24, 16 -; GFX11-NEXT: s_lshr_b32 s14, s23, 16 -; GFX11-NEXT: s_lshr_b32 s15, s22, 16 -; GFX11-NEXT: s_lshr_b32 s40, s21, 16 -; GFX11-NEXT: s_lshr_b32 s41, s20, 16 -; GFX11-NEXT: s_lshr_b32 s42, s19, 16 -; GFX11-NEXT: s_lshr_b32 s43, s18, 16 -; GFX11-NEXT: s_lshr_b32 s44, s17, 16 -; GFX11-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-NEXT: s_lshr_b32 s20, s18, 16 +; GFX11-NEXT: s_lshr_b32 s21, s19, 16 +; GFX11-NEXT: s_lshr_b32 s22, s17, 16 +; GFX11-NEXT: s_lshr_b32 s23, s16, 16 +; GFX11-NEXT: s_lshr_b32 s24, s15, 16 +; GFX11-NEXT: s_lshr_b32 s25, s14, 16 +; GFX11-NEXT: s_lshr_b32 s26, s13, 16 +; GFX11-NEXT: s_lshr_b32 s27, s12, 16 +; GFX11-NEXT: s_lshr_b32 s28, s11, 16 +; GFX11-NEXT: s_lshr_b32 s29, s10, 16 +; GFX11-NEXT: s_lshr_b32 s40, s9, 16 +; GFX11-NEXT: s_lshr_b32 s41, s8, 16 +; GFX11-NEXT: s_lshr_b32 s42, s7, 16 +; GFX11-NEXT: s_lshr_b32 s43, s6, 16 +; GFX11-NEXT: s_lshr_b32 s44, s5, 16 +; GFX11-NEXT: s_lshr_b32 s45, s4, 16 ; GFX11-NEXT: s_lshr_b32 s46, s3, 16 ; GFX11-NEXT: s_lshr_b32 s47, s2, 16 ; GFX11-NEXT: s_lshr_b32 s56, s1, 16 @@ -6783,42 +7005,42 @@ define inreg <40 x half> @bitcast_v20i32_to_v40f16_scalar(<20 x i32> inreg %a, i ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s58 ; GFX11-NEXT: s_cbranch_vccnz .LBB17_3 ; GFX11-NEXT: .LBB17_2: ; %cmp.true -; GFX11-NEXT: s_add_i32 s4, s4, 3 -; GFX11-NEXT: s_add_i32 s5, s5, 3 -; GFX11-NEXT: s_add_i32 s29, s29, 3 -; GFX11-NEXT: s_add_i32 s28, s28, 3 -; GFX11-NEXT: s_add_i32 s27, s27, 3 -; GFX11-NEXT: s_add_i32 s26, s26, 3 -; GFX11-NEXT: s_add_i32 s25, s25, 3 -; GFX11-NEXT: s_add_i32 s24, s24, 3 -; GFX11-NEXT: s_add_i32 s23, s23, 3 -; GFX11-NEXT: s_add_i32 s22, s22, 3 -; GFX11-NEXT: s_add_i32 s21, s21, 3 -; GFX11-NEXT: s_add_i32 s20, s20, 3 -; GFX11-NEXT: s_add_i32 s19, s19, 3 ; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 ; GFX11-NEXT: s_add_i32 s17, s17, 3 ; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s15, s15, 3 +; GFX11-NEXT: s_add_i32 s14, s14, 3 +; GFX11-NEXT: s_add_i32 s13, s13, 3 +; GFX11-NEXT: s_add_i32 s12, s12, 3 +; GFX11-NEXT: s_add_i32 s11, s11, 3 +; GFX11-NEXT: s_add_i32 s10, s10, 3 +; GFX11-NEXT: s_add_i32 s9, s9, 3 +; GFX11-NEXT: s_add_i32 s8, s8, 3 +; GFX11-NEXT: s_add_i32 s7, s7, 3 +; GFX11-NEXT: s_add_i32 s6, s6, 3 +; GFX11-NEXT: s_add_i32 s5, s5, 3 +; GFX11-NEXT: s_add_i32 s4, s4, 3 ; GFX11-NEXT: s_add_i32 s3, s3, 3 ; GFX11-NEXT: s_add_i32 s2, s2, 3 ; GFX11-NEXT: s_add_i32 s1, s1, 3 ; GFX11-NEXT: s_add_i32 s0, s0, 3 -; GFX11-NEXT: s_lshr_b32 s6, s4, 16 -; GFX11-NEXT: s_lshr_b32 s7, s5, 16 -; GFX11-NEXT: s_lshr_b32 s8, s29, 16 -; GFX11-NEXT: s_lshr_b32 s9, s28, 16 -; GFX11-NEXT: s_lshr_b32 s10, s27, 16 -; GFX11-NEXT: s_lshr_b32 s11, s26, 16 -; GFX11-NEXT: s_lshr_b32 s12, s25, 16 -; GFX11-NEXT: s_lshr_b32 s13, s24, 16 -; GFX11-NEXT: s_lshr_b32 s14, s23, 16 -; GFX11-NEXT: s_lshr_b32 s15, s22, 16 -; GFX11-NEXT: s_lshr_b32 s40, s21, 16 -; GFX11-NEXT: s_lshr_b32 s41, s20, 16 -; GFX11-NEXT: s_lshr_b32 s42, s19, 16 -; GFX11-NEXT: s_lshr_b32 s43, s18, 16 -; GFX11-NEXT: s_lshr_b32 s44, s17, 16 -; GFX11-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-NEXT: s_lshr_b32 s20, s18, 16 +; GFX11-NEXT: s_lshr_b32 s21, s19, 16 +; GFX11-NEXT: s_lshr_b32 s22, s17, 16 +; GFX11-NEXT: s_lshr_b32 s23, s16, 16 +; GFX11-NEXT: s_lshr_b32 s24, s15, 16 +; GFX11-NEXT: s_lshr_b32 s25, s14, 16 +; GFX11-NEXT: s_lshr_b32 s26, s13, 16 +; GFX11-NEXT: s_lshr_b32 s27, s12, 16 +; GFX11-NEXT: s_lshr_b32 s28, s11, 16 +; GFX11-NEXT: s_lshr_b32 s29, s10, 16 +; GFX11-NEXT: s_lshr_b32 s40, s9, 16 +; GFX11-NEXT: s_lshr_b32 s41, s8, 16 +; GFX11-NEXT: s_lshr_b32 s42, s7, 16 +; GFX11-NEXT: s_lshr_b32 s43, s6, 16 +; GFX11-NEXT: s_lshr_b32 s44, s5, 16 +; GFX11-NEXT: s_lshr_b32 s45, s4, 16 ; GFX11-NEXT: s_lshr_b32 s46, s3, 16 ; GFX11-NEXT: s_lshr_b32 s47, s2, 16 ; GFX11-NEXT: s_lshr_b32 s56, s1, 16 @@ -6829,32 +7051,32 @@ define inreg <40 x half> @bitcast_v20i32_to_v40f16_scalar(<20 x i32> inreg %a, i ; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s56 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s47 ; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s46 -; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s45 -; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s44 -; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s43 -; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s42 -; GFX11-NEXT: s_pack_ll_b32_b16 s20, s20, s41 -; GFX11-NEXT: s_pack_ll_b32_b16 s21, s21, s40 -; GFX11-NEXT: s_pack_ll_b32_b16 s15, s22, s15 -; GFX11-NEXT: s_pack_ll_b32_b16 s14, s23, s14 -; GFX11-NEXT: s_pack_ll_b32_b16 s13, s24, s13 -; GFX11-NEXT: s_pack_ll_b32_b16 s12, s25, s12 -; GFX11-NEXT: s_pack_ll_b32_b16 s11, s26, s11 -; GFX11-NEXT: s_pack_ll_b32_b16 s10, s27, s10 -; GFX11-NEXT: s_pack_ll_b32_b16 s9, s28, s9 -; GFX11-NEXT: s_pack_ll_b32_b16 s8, s29, s8 -; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s7 -; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s45 +; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s44 +; GFX11-NEXT: s_pack_ll_b32_b16 s6, s6, s43 +; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s42 +; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s41 +; GFX11-NEXT: s_pack_ll_b32_b16 s9, s9, s40 +; GFX11-NEXT: s_pack_ll_b32_b16 s10, s10, s29 +; GFX11-NEXT: s_pack_ll_b32_b16 s11, s11, s28 +; GFX11-NEXT: s_pack_ll_b32_b16 s12, s12, s27 +; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s26 +; GFX11-NEXT: s_pack_ll_b32_b16 s14, s14, s25 +; GFX11-NEXT: s_pack_ll_b32_b16 s15, s15, s24 +; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s23 +; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s22 +; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s21 +; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s20 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 -; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 -; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 -; GFX11-NEXT: v_dual_mov_b32 v10, s15 :: v_dual_mov_b32 v11, s14 -; GFX11-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s12 -; GFX11-NEXT: v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v15, s10 -; GFX11-NEXT: v_dual_mov_b32 v16, s9 :: v_dual_mov_b32 v17, s8 -; GFX11-NEXT: v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v19, s4 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GFX11-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-NEXT: v_dual_mov_b32 v18, s19 :: v_dual_mov_b32 v19, s18 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB17_4: ; GFX11-NEXT: ; implicit-def: $sgpr57 @@ -6867,16 +7089,16 @@ define inreg <40 x half> @bitcast_v20i32_to_v40f16_scalar(<20 x i32> inreg %a, i ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr41 ; GFX11-NEXT: ; implicit-def: $sgpr40 -; GFX11-NEXT: ; implicit-def: $sgpr15 -; GFX11-NEXT: ; implicit-def: $sgpr14 -; GFX11-NEXT: ; implicit-def: $sgpr13 -; GFX11-NEXT: ; implicit-def: $sgpr12 -; GFX11-NEXT: ; implicit-def: $sgpr11 -; GFX11-NEXT: ; implicit-def: $sgpr10 -; GFX11-NEXT: ; implicit-def: $sgpr9 -; GFX11-NEXT: ; implicit-def: $sgpr8 -; GFX11-NEXT: ; implicit-def: $sgpr7 -; GFX11-NEXT: ; implicit-def: $sgpr6 +; GFX11-NEXT: ; implicit-def: $sgpr29 +; GFX11-NEXT: ; implicit-def: $sgpr28 +; GFX11-NEXT: ; implicit-def: $sgpr27 +; GFX11-NEXT: ; implicit-def: $sgpr26 +; GFX11-NEXT: ; implicit-def: $sgpr25 +; GFX11-NEXT: ; implicit-def: $sgpr24 +; GFX11-NEXT: ; implicit-def: $sgpr23 +; GFX11-NEXT: ; implicit-def: $sgpr22 +; GFX11-NEXT: ; implicit-def: $sgpr21 +; GFX11-NEXT: ; implicit-def: $sgpr20 ; GFX11-NEXT: s_branch .LBB17_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -12019,7 +12241,7 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v19 ; SI-NEXT: .LBB30_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload @@ -14271,316 +14493,348 @@ define inreg <40 x half> @bitcast_v20f32_to_v40f16_scalar(<20 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; SI-NEXT: v_readfirstlane_b32 s11, v1 -; SI-NEXT: v_readfirstlane_b32 s10, v2 -; SI-NEXT: v_readfirstlane_b32 s8, v3 -; SI-NEXT: v_readfirstlane_b32 s7, v4 -; SI-NEXT: v_readfirstlane_b32 s6, v5 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v60, s16 +; SI-NEXT: v_mov_b32_e32 v59, s17 +; SI-NEXT: v_mov_b32_e32 v58, s18 +; SI-NEXT: v_mov_b32_e32 v57, s19 +; SI-NEXT: v_mov_b32_e32 v56, s20 +; SI-NEXT: v_mov_b32_e32 v47, s21 +; SI-NEXT: v_mov_b32_e32 v46, s22 +; SI-NEXT: v_mov_b32_e32 v45, s23 +; SI-NEXT: v_mov_b32_e32 v44, s24 +; SI-NEXT: v_mov_b32_e32 v41, s25 +; SI-NEXT: v_mov_b32_e32 v40, s26 +; SI-NEXT: v_mov_b32_e32 v55, s27 +; SI-NEXT: v_mov_b32_e32 v42, s28 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v6 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v43, s29 ; SI-NEXT: s_cbranch_scc0 .LBB33_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s16 -; SI-NEXT: s_cbranch_execnz .LBB33_3 -; SI-NEXT: .LBB33_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v16, s24, 1.0 -; SI-NEXT: v_add_f32_e64 v19, s25, 1.0 -; SI-NEXT: v_add_f32_e64 v20, s26, 1.0 -; SI-NEXT: v_add_f32_e64 v18, s27, 1.0 -; SI-NEXT: v_add_f32_e64 v17, s28, 1.0 -; SI-NEXT: v_add_f32_e64 v15, s29, 1.0 -; SI-NEXT: v_add_f32_e64 v13, s11, 1.0 -; SI-NEXT: v_add_f32_e64 v11, s10, 1.0 -; SI-NEXT: v_add_f32_e64 v9, s8, 1.0 -; SI-NEXT: v_add_f32_e64 v7, s7, 1.0 -; SI-NEXT: v_add_f32_e64 v5, s6, 1.0 -; SI-NEXT: v_add_f32_e64 v3, s9, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v60 +; SI-NEXT: s_cbranch_execnz .LBB33_3 +; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v7, 1.0, v60 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v59 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v58 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v57 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v56 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v47 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v46 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v45 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v44 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v41 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v40 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v55 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v42 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v43 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 ; SI-NEXT: .LBB33_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_or_b32_e32 v39, v39, v48 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v39, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v37, v37, v38 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v37, vcc, 8, v0 -; SI-NEXT: v_or_b32_e32 v35, v35, v36 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v35, vcc, 12, v0 -; SI-NEXT: v_or_b32_e32 v33, v33, v34 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v33, vcc, 16, v0 -; SI-NEXT: v_or_b32_e32 v31, v32, v31 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v31, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v29, v30, v29 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v29, vcc, 24, v0 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v27, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v25, vcc, 32, v0 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v30 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v23, vcc, 36, v0 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v28 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v21, vcc, 40, v0 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v26 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v19, vcc, 44, v0 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v24 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 -; SI-NEXT: v_add_i32_e32 v15, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v17 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v11 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr30 @@ -14590,12 +14844,12 @@ define inreg <40 x half> @bitcast_v20f32_to_v40f16_scalar(<20 x float> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr14 @@ -14604,15 +14858,9 @@ define inreg <40 x half> @bitcast_v20f32_to_v40f16_scalar(<20 x float> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_branch .LBB33_2 ; ; VI-LABEL: bitcast_v20f32_to_v40f16_scalar: @@ -18539,36 +18787,64 @@ define inreg <40 x i16> @bitcast_v10i64_to_v40i16_scalar(<10 x i64> inreg %a, i3 ; SI-LABEL: bitcast_v10i64_to_v40i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, s16 +; SI-NEXT: v_mov_b32_e32 v9, s17 +; SI-NEXT: v_mov_b32_e32 v10, s18 +; SI-NEXT: v_mov_b32_e32 v11, s19 +; SI-NEXT: v_mov_b32_e32 v12, s20 +; SI-NEXT: v_mov_b32_e32 v13, s21 +; SI-NEXT: v_mov_b32_e32 v14, s22 +; SI-NEXT: v_mov_b32_e32 v15, s23 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v17, s25 +; SI-NEXT: v_mov_b32_e32 v18, s26 +; SI-NEXT: v_mov_b32_e32 v19, s27 +; SI-NEXT: v_readfirstlane_b32 s22, v8 +; SI-NEXT: v_mov_b32_e32 v8, s28 +; SI-NEXT: v_readfirstlane_b32 s23, v9 +; SI-NEXT: v_mov_b32_e32 v9, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: v_readfirstlane_b32 s20, v10 +; SI-NEXT: v_readfirstlane_b32 s21, v11 +; SI-NEXT: v_readfirstlane_b32 s18, v12 +; SI-NEXT: v_readfirstlane_b32 s19, v13 +; SI-NEXT: v_readfirstlane_b32 s16, v14 +; SI-NEXT: v_readfirstlane_b32 s17, v15 +; SI-NEXT: v_readfirstlane_b32 s14, v16 +; SI-NEXT: v_readfirstlane_b32 s15, v17 +; SI-NEXT: v_readfirstlane_b32 s12, v18 +; SI-NEXT: v_readfirstlane_b32 s13, v19 +; SI-NEXT: v_readfirstlane_b32 s10, v8 +; SI-NEXT: v_readfirstlane_b32 s11, v9 ; SI-NEXT: v_readfirstlane_b32 s8, v1 ; SI-NEXT: v_readfirstlane_b32 s9, v2 ; SI-NEXT: v_readfirstlane_b32 s6, v3 ; SI-NEXT: v_readfirstlane_b32 s7, v4 ; SI-NEXT: v_readfirstlane_b32 s4, v5 -; SI-NEXT: s_and_b64 s[10:11], vcc, exec +; SI-NEXT: s_and_b64 s[24:25], vcc, exec ; SI-NEXT: v_readfirstlane_b32 s5, v6 ; SI-NEXT: s_cbranch_scc0 .LBB41_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s72, s5, 16 ; SI-NEXT: s_lshr_b32 s73, s7, 16 ; SI-NEXT: s_lshr_b32 s74, s9, 16 -; SI-NEXT: s_lshr_b32 s75, s29, 16 -; SI-NEXT: s_lshr_b32 s76, s27, 16 -; SI-NEXT: s_lshr_b32 s77, s25, 16 -; SI-NEXT: s_lshr_b32 s78, s23, 16 -; SI-NEXT: s_lshr_b32 s79, s21, 16 -; SI-NEXT: s_lshr_b32 s88, s19, 16 -; SI-NEXT: s_lshr_b32 s89, s17, 16 -; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[12:13], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[14:15], s[8:9], 16 -; SI-NEXT: s_lshr_b64 s[40:41], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[42:43], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[46:47], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[56:57], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[58:59], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[60:61], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s75, s11, 16 +; SI-NEXT: s_lshr_b32 s76, s13, 16 +; SI-NEXT: s_lshr_b32 s77, s15, 16 +; SI-NEXT: s_lshr_b32 s78, s17, 16 +; SI-NEXT: s_lshr_b32 s79, s19, 16 +; SI-NEXT: s_lshr_b32 s88, s21, 16 +; SI-NEXT: s_lshr_b32 s89, s23, 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[22:23], 16 ; SI-NEXT: s_cbranch_execnz .LBB41_3 ; SI-NEXT: .LBB41_2: ; %cmp.true ; SI-NEXT: s_add_u32 s4, s4, 3 @@ -18577,139 +18853,139 @@ define inreg <40 x i16> @bitcast_v10i64_to_v40i16_scalar(<10 x i64> inreg %a, i3 ; SI-NEXT: s_addc_u32 s7, s7, 0 ; SI-NEXT: s_add_u32 s8, s8, 3 ; SI-NEXT: s_addc_u32 s9, s9, 0 -; SI-NEXT: s_add_u32 s28, s28, 3 -; SI-NEXT: s_addc_u32 s29, s29, 0 -; SI-NEXT: s_add_u32 s26, s26, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 -; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s25, s25, 0 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 ; SI-NEXT: s_add_u32 s16, s16, 3 ; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 ; SI-NEXT: s_lshr_b32 s72, s5, 16 ; SI-NEXT: s_lshr_b32 s73, s7, 16 ; SI-NEXT: s_lshr_b32 s74, s9, 16 -; SI-NEXT: s_lshr_b32 s75, s29, 16 -; SI-NEXT: s_lshr_b32 s76, s27, 16 -; SI-NEXT: s_lshr_b32 s77, s25, 16 -; SI-NEXT: s_lshr_b32 s78, s23, 16 -; SI-NEXT: s_lshr_b32 s79, s21, 16 -; SI-NEXT: s_lshr_b32 s88, s19, 16 -; SI-NEXT: s_lshr_b32 s89, s17, 16 -; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[12:13], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[14:15], s[8:9], 16 -; SI-NEXT: s_lshr_b64 s[40:41], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[42:43], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[46:47], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[56:57], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[58:59], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[60:61], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s75, s11, 16 +; SI-NEXT: s_lshr_b32 s76, s13, 16 +; SI-NEXT: s_lshr_b32 s77, s15, 16 +; SI-NEXT: s_lshr_b32 s78, s17, 16 +; SI-NEXT: s_lshr_b32 s79, s19, 16 +; SI-NEXT: s_lshr_b32 s88, s21, 16 +; SI-NEXT: s_lshr_b32 s89, s23, 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[22:23], 16 ; SI-NEXT: .LBB41_3: ; %end -; SI-NEXT: s_lshl_b32 s11, s60, 16 -; SI-NEXT: s_and_b32 s13, s16, 0xffff -; SI-NEXT: s_or_b32 s11, s13, s11 -; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: s_and_b32 s11, s17, 0xffff -; SI-NEXT: s_lshl_b32 s13, s89, 16 -; SI-NEXT: s_or_b32 s11, s11, s13 -; SI-NEXT: v_mov_b32_e32 v2, s11 -; SI-NEXT: s_lshl_b32 s11, s58, 16 -; SI-NEXT: s_and_b32 s13, s18, 0xffff -; SI-NEXT: s_or_b32 s11, s13, s11 +; SI-NEXT: s_lshl_b32 s25, s60, 16 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_or_b32 s22, s22, s25 +; SI-NEXT: v_mov_b32_e32 v1, s22 +; SI-NEXT: s_and_b32 s22, s23, 0xffff +; SI-NEXT: s_lshl_b32 s23, s89, 16 +; SI-NEXT: s_or_b32 s22, s22, s23 +; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: s_lshl_b32 s22, s58, 16 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_or_b32 s20, s20, s22 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s11 -; SI-NEXT: s_and_b32 s11, s19, 0xffff -; SI-NEXT: s_lshl_b32 s13, s88, 16 +; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_and_b32 s20, s21, 0xffff +; SI-NEXT: s_lshl_b32 s21, s88, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: s_or_b32 s20, s20, s21 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s11 -; SI-NEXT: s_and_b32 s11, s20, 0xffff -; SI-NEXT: s_lshl_b32 s13, s56, 16 +; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s20, s56, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: s_or_b32 s18, s18, s20 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s11 -; SI-NEXT: s_and_b32 s11, s21, 0xffff -; SI-NEXT: s_lshl_b32 s13, s79, 16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_and_b32 s18, s19, 0xffff +; SI-NEXT: s_lshl_b32 s19, s79, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s11 -; SI-NEXT: s_and_b32 s11, s22, 0xffff -; SI-NEXT: s_lshl_b32 s13, s46, 16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s18, s46, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: s_or_b32 s16, s16, s18 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s11 -; SI-NEXT: s_and_b32 s11, s23, 0xffff -; SI-NEXT: s_lshl_b32 s13, s78, 16 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s17, 0xffff +; SI-NEXT: s_lshl_b32 s17, s78, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s11 -; SI-NEXT: s_and_b32 s11, s24, 0xffff -; SI-NEXT: s_lshl_b32 s13, s44, 16 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s16, s44, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: s_or_b32 s14, s14, s16 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s11 -; SI-NEXT: s_and_b32 s11, s25, 0xffff -; SI-NEXT: s_lshl_b32 s13, s77, 16 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_and_b32 s14, s15, 0xffff +; SI-NEXT: s_lshl_b32 s15, s77, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s11 -; SI-NEXT: s_and_b32 s11, s26, 0xffff -; SI-NEXT: s_lshl_b32 s13, s42, 16 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s14, s42, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: s_or_b32 s12, s12, s14 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s11 -; SI-NEXT: s_and_b32 s11, s27, 0xffff +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s12, s13, 0xffff ; SI-NEXT: s_lshl_b32 s13, s76, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 -; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: s_or_b32 s12, s12, s13 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s11 -; SI-NEXT: s_and_b32 s11, s28, 0xffff -; SI-NEXT: s_lshl_b32 s13, s40, 16 +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s12, s40, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: s_or_b32 s10, s10, s12 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s11 -; SI-NEXT: s_and_b32 s11, s29, 0xffff -; SI-NEXT: s_lshl_b32 s13, s75, 16 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s10, s11, 0xffff +; SI-NEXT: s_lshl_b32 s11, s75, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: s_lshl_b32 s11, s14, 16 +; SI-NEXT: s_lshl_b32 s10, s28, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 -; SI-NEXT: s_or_b32 s8, s8, s11 +; SI-NEXT: s_or_b32 s8, s8, s10 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 @@ -18721,7 +18997,7 @@ define inreg <40 x i16> @bitcast_v10i64_to_v40i16_scalar(<10 x i64> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_lshl_b32 s8, s12, 16 +; SI-NEXT: s_lshl_b32 s8, s26, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 ; SI-NEXT: s_or_b32 s6, s6, s8 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -18735,7 +19011,7 @@ define inreg <40 x i16> @bitcast_v10i64_to_v40i16_scalar(<10 x i64> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s6, s10, 16 +; SI-NEXT: s_lshl_b32 s6, s24, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -18766,18 +19042,46 @@ define inreg <40 x i16> @bitcast_v10i64_to_v40i16_scalar(<10 x i64> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr76 ; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: ; implicit-def: $sgpr75 -; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr73 -; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr24 ; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: s_branch .LBB41_2 ; ; VI-LABEL: bitcast_v10i64_to_v40i16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, s16 +; VI-NEXT: v_mov_b32_e32 v8, s17 +; VI-NEXT: v_mov_b32_e32 v9, s18 +; VI-NEXT: v_mov_b32_e32 v10, s19 +; VI-NEXT: v_mov_b32_e32 v11, s20 +; VI-NEXT: v_mov_b32_e32 v12, s21 +; VI-NEXT: v_mov_b32_e32 v13, s22 +; VI-NEXT: v_mov_b32_e32 v14, s23 +; VI-NEXT: v_mov_b32_e32 v15, s24 +; VI-NEXT: v_mov_b32_e32 v16, s25 +; VI-NEXT: v_mov_b32_e32 v17, s26 +; VI-NEXT: v_mov_b32_e32 v18, s27 +; VI-NEXT: v_mov_b32_e32 v19, s28 +; VI-NEXT: v_readfirstlane_b32 s25, v7 +; VI-NEXT: v_mov_b32_e32 v7, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_readfirstlane_b32 s24, v8 +; VI-NEXT: v_readfirstlane_b32 s23, v9 +; VI-NEXT: v_readfirstlane_b32 s22, v10 +; VI-NEXT: v_readfirstlane_b32 s21, v11 +; VI-NEXT: v_readfirstlane_b32 s20, v12 +; VI-NEXT: v_readfirstlane_b32 s19, v13 +; VI-NEXT: v_readfirstlane_b32 s18, v14 +; VI-NEXT: v_readfirstlane_b32 s17, v15 +; VI-NEXT: v_readfirstlane_b32 s16, v16 +; VI-NEXT: v_readfirstlane_b32 s15, v17 +; VI-NEXT: v_readfirstlane_b32 s14, v18 +; VI-NEXT: v_readfirstlane_b32 s13, v19 +; VI-NEXT: v_readfirstlane_b32 s12, v7 ; VI-NEXT: v_readfirstlane_b32 s11, v0 ; VI-NEXT: v_readfirstlane_b32 s10, v1 ; VI-NEXT: v_readfirstlane_b32 s9, v2 @@ -18787,26 +19091,26 @@ define inreg <40 x i16> @bitcast_v10i64_to_v40i16_scalar(<10 x i64> inreg %a, i3 ; VI-NEXT: v_readfirstlane_b32 s7, v5 ; VI-NEXT: s_cbranch_scc0 .LBB41_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s12, s7, 16 -; VI-NEXT: s_lshr_b32 s13, s6, 16 -; VI-NEXT: s_lshr_b32 s14, s8, 16 -; VI-NEXT: s_lshr_b32 s15, s9, 16 +; VI-NEXT: s_lshr_b32 s26, s7, 16 +; VI-NEXT: s_lshr_b32 s27, s6, 16 +; VI-NEXT: s_lshr_b32 s28, s8, 16 +; VI-NEXT: s_lshr_b32 s29, s9, 16 ; VI-NEXT: s_lshr_b32 s40, s10, 16 ; VI-NEXT: s_lshr_b32 s41, s11, 16 -; VI-NEXT: s_lshr_b32 s42, s29, 16 -; VI-NEXT: s_lshr_b32 s43, s28, 16 -; VI-NEXT: s_lshr_b32 s44, s27, 16 -; VI-NEXT: s_lshr_b32 s45, s26, 16 -; VI-NEXT: s_lshr_b32 s46, s25, 16 -; VI-NEXT: s_lshr_b32 s47, s24, 16 -; VI-NEXT: s_lshr_b32 s56, s23, 16 -; VI-NEXT: s_lshr_b32 s57, s22, 16 -; VI-NEXT: s_lshr_b32 s58, s21, 16 -; VI-NEXT: s_lshr_b32 s59, s20, 16 -; VI-NEXT: s_lshr_b32 s60, s19, 16 -; VI-NEXT: s_lshr_b32 s61, s18, 16 -; VI-NEXT: s_lshr_b32 s62, s17, 16 -; VI-NEXT: s_lshr_b32 s63, s16, 16 +; VI-NEXT: s_lshr_b32 s42, s12, 16 +; VI-NEXT: s_lshr_b32 s43, s13, 16 +; VI-NEXT: s_lshr_b32 s44, s14, 16 +; VI-NEXT: s_lshr_b32 s45, s15, 16 +; VI-NEXT: s_lshr_b32 s46, s16, 16 +; VI-NEXT: s_lshr_b32 s47, s17, 16 +; VI-NEXT: s_lshr_b32 s56, s18, 16 +; VI-NEXT: s_lshr_b32 s57, s19, 16 +; VI-NEXT: s_lshr_b32 s58, s20, 16 +; VI-NEXT: s_lshr_b32 s59, s21, 16 +; VI-NEXT: s_lshr_b32 s60, s22, 16 +; VI-NEXT: s_lshr_b32 s61, s23, 16 +; VI-NEXT: s_lshr_b32 s62, s24, 16 +; VI-NEXT: s_lshr_b32 s63, s25, 16 ; VI-NEXT: s_cbranch_execnz .LBB41_3 ; VI-NEXT: .LBB41_2: ; %cmp.true ; VI-NEXT: s_add_u32 s6, s6, 3 @@ -18815,115 +19119,115 @@ define inreg <40 x i16> @bitcast_v10i64_to_v40i16_scalar(<10 x i64> inreg %a, i3 ; VI-NEXT: s_addc_u32 s8, s8, 0 ; VI-NEXT: s_add_u32 s11, s11, 3 ; VI-NEXT: s_addc_u32 s10, s10, 0 -; VI-NEXT: s_add_u32 s28, s28, 3 -; VI-NEXT: s_addc_u32 s29, s29, 0 -; VI-NEXT: s_add_u32 s26, s26, 3 -; VI-NEXT: s_addc_u32 s27, s27, 0 -; VI-NEXT: s_add_u32 s24, s24, 3 -; VI-NEXT: s_addc_u32 s25, s25, 0 -; VI-NEXT: s_add_u32 s22, s22, 3 -; VI-NEXT: s_addc_u32 s23, s23, 0 -; VI-NEXT: s_add_u32 s20, s20, 3 -; VI-NEXT: s_addc_u32 s21, s21, 0 -; VI-NEXT: s_add_u32 s18, s18, 3 -; VI-NEXT: s_addc_u32 s19, s19, 0 -; VI-NEXT: s_add_u32 s16, s16, 3 -; VI-NEXT: s_addc_u32 s17, s17, 0 -; VI-NEXT: s_lshr_b32 s12, s7, 16 -; VI-NEXT: s_lshr_b32 s13, s6, 16 -; VI-NEXT: s_lshr_b32 s14, s8, 16 -; VI-NEXT: s_lshr_b32 s15, s9, 16 +; VI-NEXT: s_add_u32 s13, s13, 3 +; VI-NEXT: s_addc_u32 s12, s12, 0 +; VI-NEXT: s_add_u32 s15, s15, 3 +; VI-NEXT: s_addc_u32 s14, s14, 0 +; VI-NEXT: s_add_u32 s17, s17, 3 +; VI-NEXT: s_addc_u32 s16, s16, 0 +; VI-NEXT: s_add_u32 s19, s19, 3 +; VI-NEXT: s_addc_u32 s18, s18, 0 +; VI-NEXT: s_add_u32 s21, s21, 3 +; VI-NEXT: s_addc_u32 s20, s20, 0 +; VI-NEXT: s_add_u32 s23, s23, 3 +; VI-NEXT: s_addc_u32 s22, s22, 0 +; VI-NEXT: s_add_u32 s25, s25, 3 +; VI-NEXT: s_addc_u32 s24, s24, 0 +; VI-NEXT: s_lshr_b32 s26, s7, 16 +; VI-NEXT: s_lshr_b32 s27, s6, 16 +; VI-NEXT: s_lshr_b32 s28, s8, 16 +; VI-NEXT: s_lshr_b32 s29, s9, 16 ; VI-NEXT: s_lshr_b32 s40, s10, 16 ; VI-NEXT: s_lshr_b32 s41, s11, 16 -; VI-NEXT: s_lshr_b32 s42, s29, 16 -; VI-NEXT: s_lshr_b32 s43, s28, 16 -; VI-NEXT: s_lshr_b32 s44, s27, 16 -; VI-NEXT: s_lshr_b32 s45, s26, 16 -; VI-NEXT: s_lshr_b32 s46, s25, 16 -; VI-NEXT: s_lshr_b32 s47, s24, 16 -; VI-NEXT: s_lshr_b32 s56, s23, 16 -; VI-NEXT: s_lshr_b32 s57, s22, 16 -; VI-NEXT: s_lshr_b32 s58, s21, 16 -; VI-NEXT: s_lshr_b32 s59, s20, 16 -; VI-NEXT: s_lshr_b32 s60, s19, 16 -; VI-NEXT: s_lshr_b32 s61, s18, 16 -; VI-NEXT: s_lshr_b32 s62, s17, 16 -; VI-NEXT: s_lshr_b32 s63, s16, 16 +; VI-NEXT: s_lshr_b32 s42, s12, 16 +; VI-NEXT: s_lshr_b32 s43, s13, 16 +; VI-NEXT: s_lshr_b32 s44, s14, 16 +; VI-NEXT: s_lshr_b32 s45, s15, 16 +; VI-NEXT: s_lshr_b32 s46, s16, 16 +; VI-NEXT: s_lshr_b32 s47, s17, 16 +; VI-NEXT: s_lshr_b32 s56, s18, 16 +; VI-NEXT: s_lshr_b32 s57, s19, 16 +; VI-NEXT: s_lshr_b32 s58, s20, 16 +; VI-NEXT: s_lshr_b32 s59, s21, 16 +; VI-NEXT: s_lshr_b32 s60, s22, 16 +; VI-NEXT: s_lshr_b32 s61, s23, 16 +; VI-NEXT: s_lshr_b32 s62, s24, 16 +; VI-NEXT: s_lshr_b32 s63, s25, 16 ; VI-NEXT: .LBB41_3: ; %end -; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_and_b32 s4, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s5, s63, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s16, s62, 16 -; VI-NEXT: s_or_b32 s5, s5, s16 -; VI-NEXT: s_and_b32 s16, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s17, s61, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_and_b32 s17, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s18, s60, 16 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s18, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s19, s59, 16 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_and_b32 s19, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s20, s58, 16 -; VI-NEXT: s_or_b32 s19, s19, s20 -; VI-NEXT: s_and_b32 s20, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s21, s57, 16 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_and_b32 s21, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s22, s56, 16 -; VI-NEXT: s_or_b32 s21, s21, s22 -; VI-NEXT: s_and_b32 s22, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s23, s47, 16 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_and_b32 s23, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s24, s46, 16 +; VI-NEXT: s_and_b32 s5, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s24, s62, 16 +; VI-NEXT: s_or_b32 s5, s5, s24 +; VI-NEXT: s_and_b32 s23, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s24, s61, 16 ; VI-NEXT: s_or_b32 s23, s23, s24 -; VI-NEXT: s_and_b32 s24, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s25, s45, 16 -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_and_b32 s25, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s26, s44, 16 -; VI-NEXT: s_or_b32 s25, s25, s26 -; VI-NEXT: s_and_b32 s26, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s27, s43, 16 -; VI-NEXT: s_or_b32 s26, s26, s27 -; VI-NEXT: s_and_b32 s27, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s28, s42, 16 -; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s22, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s24, s60, 16 +; VI-NEXT: s_or_b32 s22, s22, s24 +; VI-NEXT: s_and_b32 s21, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s24, s59, 16 +; VI-NEXT: s_or_b32 s21, s21, s24 +; VI-NEXT: s_and_b32 s20, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s24, s58, 16 +; VI-NEXT: s_or_b32 s20, s20, s24 +; VI-NEXT: s_and_b32 s19, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s24, s57, 16 +; VI-NEXT: s_or_b32 s19, s19, s24 +; VI-NEXT: s_and_b32 s18, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s24, s56, 16 +; VI-NEXT: s_or_b32 s18, s18, s24 +; VI-NEXT: s_and_b32 s17, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s24, s47, 16 +; VI-NEXT: s_or_b32 s17, s17, s24 +; VI-NEXT: s_and_b32 s16, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s24, s46, 16 +; VI-NEXT: s_or_b32 s16, s16, s24 +; VI-NEXT: s_and_b32 s15, 0xffff, s15 +; VI-NEXT: s_lshl_b32 s24, s45, 16 +; VI-NEXT: s_or_b32 s15, s15, s24 +; VI-NEXT: s_and_b32 s14, 0xffff, s14 +; VI-NEXT: s_lshl_b32 s24, s44, 16 +; VI-NEXT: s_or_b32 s14, s14, s24 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s24, s43, 16 +; VI-NEXT: s_or_b32 s13, s13, s24 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s24, s42, 16 +; VI-NEXT: s_or_b32 s12, s12, s24 ; VI-NEXT: s_and_b32 s11, 0xffff, s11 -; VI-NEXT: s_lshl_b32 s28, s41, 16 -; VI-NEXT: s_or_b32 s11, s11, s28 +; VI-NEXT: s_lshl_b32 s24, s41, 16 +; VI-NEXT: s_or_b32 s11, s11, s24 ; VI-NEXT: s_and_b32 s10, 0xffff, s10 -; VI-NEXT: s_lshl_b32 s28, s40, 16 +; VI-NEXT: s_lshl_b32 s24, s40, 16 +; VI-NEXT: s_or_b32 s10, s10, s24 ; VI-NEXT: s_and_b32 s9, 0xffff, s9 -; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_lshl_b32 s24, s29, 16 +; VI-NEXT: s_or_b32 s9, s9, s24 ; VI-NEXT: s_and_b32 s8, 0xffff, s8 -; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_lshl_b32 s24, s28, 16 +; VI-NEXT: s_or_b32 s8, s8, s24 ; VI-NEXT: s_and_b32 s6, 0xffff, s6 -; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_lshl_b32 s24, s27, 16 +; VI-NEXT: s_or_b32 s6, s6, s24 ; VI-NEXT: s_and_b32 s7, 0xffff, s7 -; VI-NEXT: s_lshl_b32 s12, s12, 16 -; VI-NEXT: s_or_b32 s10, s10, s28 -; VI-NEXT: s_or_b32 s9, s9, s15 -; VI-NEXT: s_or_b32 s8, s8, s14 -; VI-NEXT: s_or_b32 s6, s6, s13 -; VI-NEXT: s_or_b32 s7, s7, s12 +; VI-NEXT: s_lshl_b32 s24, s26, 16 +; VI-NEXT: s_or_b32 s7, s7, s24 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: v_mov_b32_e32 v3, s17 -; VI-NEXT: v_mov_b32_e32 v4, s18 -; VI-NEXT: v_mov_b32_e32 v5, s19 -; VI-NEXT: v_mov_b32_e32 v6, s20 -; VI-NEXT: v_mov_b32_e32 v7, s21 -; VI-NEXT: v_mov_b32_e32 v8, s22 -; VI-NEXT: v_mov_b32_e32 v9, s23 -; VI-NEXT: v_mov_b32_e32 v10, s24 -; VI-NEXT: v_mov_b32_e32 v11, s25 -; VI-NEXT: v_mov_b32_e32 v12, s26 -; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v2, s23 +; VI-NEXT: v_mov_b32_e32 v3, s22 +; VI-NEXT: v_mov_b32_e32 v4, s21 +; VI-NEXT: v_mov_b32_e32 v5, s20 +; VI-NEXT: v_mov_b32_e32 v6, s19 +; VI-NEXT: v_mov_b32_e32 v7, s18 +; VI-NEXT: v_mov_b32_e32 v8, s17 +; VI-NEXT: v_mov_b32_e32 v9, s16 +; VI-NEXT: v_mov_b32_e32 v10, s15 +; VI-NEXT: v_mov_b32_e32 v11, s14 +; VI-NEXT: v_mov_b32_e32 v12, s13 +; VI-NEXT: v_mov_b32_e32 v13, s12 ; VI-NEXT: v_mov_b32_e32 v14, s11 ; VI-NEXT: v_mov_b32_e32 v15, s10 ; VI-NEXT: v_mov_b32_e32 v16, s9 @@ -18948,57 +19252,75 @@ define inreg <40 x i16> @bitcast_v10i64_to_v40i16_scalar(<10 x i64> inreg %a, i3 ; VI-NEXT: ; implicit-def: $sgpr42 ; VI-NEXT: ; implicit-def: $sgpr41 ; VI-NEXT: ; implicit-def: $sgpr40 -; VI-NEXT: ; implicit-def: $sgpr15 -; VI-NEXT: ; implicit-def: $sgpr14 -; VI-NEXT: ; implicit-def: $sgpr13 -; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: s_branch .LBB41_2 ; ; GFX9-LABEL: bitcast_v10i64_to_v40i16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, s16 +; GFX9-NEXT: v_mov_b32_e32 v8, s17 +; GFX9-NEXT: v_mov_b32_e32 v9, s18 +; GFX9-NEXT: v_mov_b32_e32 v10, s19 +; GFX9-NEXT: v_mov_b32_e32 v11, s20 +; GFX9-NEXT: v_mov_b32_e32 v12, s21 +; GFX9-NEXT: v_mov_b32_e32 v13, s22 +; GFX9-NEXT: v_mov_b32_e32 v14, s23 +; GFX9-NEXT: v_mov_b32_e32 v15, s24 +; GFX9-NEXT: v_mov_b32_e32 v16, s25 +; GFX9-NEXT: v_mov_b32_e32 v17, s26 +; GFX9-NEXT: v_mov_b32_e32 v18, s27 +; GFX9-NEXT: v_mov_b32_e32 v19, s28 +; GFX9-NEXT: v_readfirstlane_b32 s6, v7 +; GFX9-NEXT: v_mov_b32_e32 v7, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: v_readfirstlane_b32 s7, v1 -; GFX9-NEXT: v_readfirstlane_b32 s8, v2 -; GFX9-NEXT: v_readfirstlane_b32 s9, v3 -; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: v_readfirstlane_b32 s7, v8 +; GFX9-NEXT: v_readfirstlane_b32 s8, v9 +; GFX9-NEXT: v_readfirstlane_b32 s9, v10 +; GFX9-NEXT: v_readfirstlane_b32 s10, v11 +; GFX9-NEXT: v_readfirstlane_b32 s11, v12 +; GFX9-NEXT: v_readfirstlane_b32 s12, v13 +; GFX9-NEXT: v_readfirstlane_b32 s13, v14 +; GFX9-NEXT: v_readfirstlane_b32 s14, v15 +; GFX9-NEXT: v_readfirstlane_b32 s15, v16 +; GFX9-NEXT: v_readfirstlane_b32 s16, v17 +; GFX9-NEXT: v_readfirstlane_b32 s17, v18 +; GFX9-NEXT: v_readfirstlane_b32 s18, v19 +; GFX9-NEXT: v_readfirstlane_b32 s19, v7 +; GFX9-NEXT: v_readfirstlane_b32 s20, v0 +; GFX9-NEXT: v_readfirstlane_b32 s21, v1 +; GFX9-NEXT: v_readfirstlane_b32 s22, v2 +; GFX9-NEXT: v_readfirstlane_b32 s23, v3 +; GFX9-NEXT: v_readfirstlane_b32 s24, v4 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: v_readfirstlane_b32 s25, v5 ; GFX9-NEXT: s_cbranch_scc0 .LBB41_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_lshr_b32 s12, s11, 16 -; GFX9-NEXT: s_lshr_b32 s13, s10, 16 -; GFX9-NEXT: s_lshr_b32 s14, s9, 16 -; GFX9-NEXT: s_lshr_b32 s15, s8, 16 -; GFX9-NEXT: s_lshr_b32 s40, s7, 16 -; GFX9-NEXT: s_lshr_b32 s41, s6, 16 -; GFX9-NEXT: s_lshr_b32 s42, s29, 16 -; GFX9-NEXT: s_lshr_b32 s43, s28, 16 -; GFX9-NEXT: s_lshr_b32 s44, s27, 16 -; GFX9-NEXT: s_lshr_b32 s45, s26, 16 -; GFX9-NEXT: s_lshr_b32 s46, s25, 16 -; GFX9-NEXT: s_lshr_b32 s47, s24, 16 -; GFX9-NEXT: s_lshr_b32 s56, s23, 16 -; GFX9-NEXT: s_lshr_b32 s57, s22, 16 -; GFX9-NEXT: s_lshr_b32 s58, s21, 16 -; GFX9-NEXT: s_lshr_b32 s59, s20, 16 -; GFX9-NEXT: s_lshr_b32 s60, s19, 16 -; GFX9-NEXT: s_lshr_b32 s61, s18, 16 -; GFX9-NEXT: s_lshr_b32 s62, s17, 16 -; GFX9-NEXT: s_lshr_b32 s63, s16, 16 +; GFX9-NEXT: s_lshr_b32 s26, s25, 16 +; GFX9-NEXT: s_lshr_b32 s27, s24, 16 +; GFX9-NEXT: s_lshr_b32 s28, s23, 16 +; GFX9-NEXT: s_lshr_b32 s29, s22, 16 +; GFX9-NEXT: s_lshr_b32 s40, s21, 16 +; GFX9-NEXT: s_lshr_b32 s41, s20, 16 +; GFX9-NEXT: s_lshr_b32 s42, s19, 16 +; GFX9-NEXT: s_lshr_b32 s43, s18, 16 +; GFX9-NEXT: s_lshr_b32 s44, s17, 16 +; GFX9-NEXT: s_lshr_b32 s45, s16, 16 +; GFX9-NEXT: s_lshr_b32 s46, s15, 16 +; GFX9-NEXT: s_lshr_b32 s47, s14, 16 +; GFX9-NEXT: s_lshr_b32 s56, s13, 16 +; GFX9-NEXT: s_lshr_b32 s57, s12, 16 +; GFX9-NEXT: s_lshr_b32 s58, s11, 16 +; GFX9-NEXT: s_lshr_b32 s59, s10, 16 +; GFX9-NEXT: s_lshr_b32 s60, s9, 16 +; GFX9-NEXT: s_lshr_b32 s61, s8, 16 +; GFX9-NEXT: s_lshr_b32 s62, s7, 16 +; GFX9-NEXT: s_lshr_b32 s63, s6, 16 ; GFX9-NEXT: s_cbranch_execnz .LBB41_3 ; GFX9-NEXT: .LBB41_2: ; %cmp.true -; GFX9-NEXT: s_add_u32 s10, s10, 3 -; GFX9-NEXT: s_addc_u32 s11, s11, 0 -; GFX9-NEXT: s_add_u32 s8, s8, 3 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-NEXT: s_add_u32 s6, s6, 3 -; GFX9-NEXT: s_addc_u32 s7, s7, 0 -; GFX9-NEXT: s_add_u32 s28, s28, 3 -; GFX9-NEXT: s_addc_u32 s29, s29, 0 -; GFX9-NEXT: s_add_u32 s26, s26, 3 -; GFX9-NEXT: s_addc_u32 s27, s27, 0 ; GFX9-NEXT: s_add_u32 s24, s24, 3 ; GFX9-NEXT: s_addc_u32 s25, s25, 0 ; GFX9-NEXT: s_add_u32 s22, s22, 3 @@ -19009,67 +19331,77 @@ define inreg <40 x i16> @bitcast_v10i64_to_v40i16_scalar(<10 x i64> inreg %a, i3 ; GFX9-NEXT: s_addc_u32 s19, s19, 0 ; GFX9-NEXT: s_add_u32 s16, s16, 3 ; GFX9-NEXT: s_addc_u32 s17, s17, 0 -; GFX9-NEXT: s_lshr_b32 s12, s11, 16 -; GFX9-NEXT: s_lshr_b32 s13, s10, 16 -; GFX9-NEXT: s_lshr_b32 s14, s9, 16 -; GFX9-NEXT: s_lshr_b32 s15, s8, 16 -; GFX9-NEXT: s_lshr_b32 s40, s7, 16 -; GFX9-NEXT: s_lshr_b32 s41, s6, 16 -; GFX9-NEXT: s_lshr_b32 s42, s29, 16 -; GFX9-NEXT: s_lshr_b32 s43, s28, 16 -; GFX9-NEXT: s_lshr_b32 s44, s27, 16 -; GFX9-NEXT: s_lshr_b32 s45, s26, 16 -; GFX9-NEXT: s_lshr_b32 s46, s25, 16 -; GFX9-NEXT: s_lshr_b32 s47, s24, 16 -; GFX9-NEXT: s_lshr_b32 s56, s23, 16 -; GFX9-NEXT: s_lshr_b32 s57, s22, 16 -; GFX9-NEXT: s_lshr_b32 s58, s21, 16 -; GFX9-NEXT: s_lshr_b32 s59, s20, 16 -; GFX9-NEXT: s_lshr_b32 s60, s19, 16 -; GFX9-NEXT: s_lshr_b32 s61, s18, 16 -; GFX9-NEXT: s_lshr_b32 s62, s17, 16 -; GFX9-NEXT: s_lshr_b32 s63, s16, 16 +; GFX9-NEXT: s_add_u32 s14, s14, 3 +; GFX9-NEXT: s_addc_u32 s15, s15, 0 +; GFX9-NEXT: s_add_u32 s12, s12, 3 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_add_u32 s10, s10, 3 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_lshr_b32 s26, s25, 16 +; GFX9-NEXT: s_lshr_b32 s27, s24, 16 +; GFX9-NEXT: s_lshr_b32 s28, s23, 16 +; GFX9-NEXT: s_lshr_b32 s29, s22, 16 +; GFX9-NEXT: s_lshr_b32 s40, s21, 16 +; GFX9-NEXT: s_lshr_b32 s41, s20, 16 +; GFX9-NEXT: s_lshr_b32 s42, s19, 16 +; GFX9-NEXT: s_lshr_b32 s43, s18, 16 +; GFX9-NEXT: s_lshr_b32 s44, s17, 16 +; GFX9-NEXT: s_lshr_b32 s45, s16, 16 +; GFX9-NEXT: s_lshr_b32 s46, s15, 16 +; GFX9-NEXT: s_lshr_b32 s47, s14, 16 +; GFX9-NEXT: s_lshr_b32 s56, s13, 16 +; GFX9-NEXT: s_lshr_b32 s57, s12, 16 +; GFX9-NEXT: s_lshr_b32 s58, s11, 16 +; GFX9-NEXT: s_lshr_b32 s59, s10, 16 +; GFX9-NEXT: s_lshr_b32 s60, s9, 16 +; GFX9-NEXT: s_lshr_b32 s61, s8, 16 +; GFX9-NEXT: s_lshr_b32 s62, s7, 16 +; GFX9-NEXT: s_lshr_b32 s63, s6, 16 ; GFX9-NEXT: .LBB41_3: ; %end -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s63 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s62 -; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s61 -; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s60 -; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s59 -; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s58 -; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s57 -; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s56 -; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s47 -; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s46 -; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s45 -; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s44 -; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s43 -; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s42 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s41 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s40 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s15 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s14 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s13 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s7, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s9, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s10, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s11, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s12, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s13, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s14, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s15, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s16, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s17, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s40 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s29 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s28 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s27 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s26 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s18 -; GFX9-NEXT: v_mov_b32_e32 v5, s19 -; GFX9-NEXT: v_mov_b32_e32 v6, s20 -; GFX9-NEXT: v_mov_b32_e32 v7, s21 -; GFX9-NEXT: v_mov_b32_e32 v8, s22 -; GFX9-NEXT: v_mov_b32_e32 v9, s23 -; GFX9-NEXT: v_mov_b32_e32 v10, s24 -; GFX9-NEXT: v_mov_b32_e32 v11, s25 -; GFX9-NEXT: v_mov_b32_e32 v12, s26 -; GFX9-NEXT: v_mov_b32_e32 v13, s27 -; GFX9-NEXT: v_mov_b32_e32 v14, s6 -; GFX9-NEXT: v_mov_b32_e32 v15, s7 -; GFX9-NEXT: v_mov_b32_e32 v16, s8 -; GFX9-NEXT: v_mov_b32_e32 v17, s9 -; GFX9-NEXT: v_mov_b32_e32 v18, s10 -; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-NEXT: v_mov_b32_e32 v12, s16 +; GFX9-NEXT: v_mov_b32_e32 v13, s17 +; GFX9-NEXT: v_mov_b32_e32 v14, s18 +; GFX9-NEXT: v_mov_b32_e32 v15, s19 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v18, s22 +; GFX9-NEXT: v_mov_b32_e32 v19, s23 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB41_4: ; GFX9-NEXT: ; implicit-def: $sgpr63 @@ -19088,38 +19420,65 @@ define inreg <40 x i16> @bitcast_v10i64_to_v40i16_scalar(<10 x i64> inreg %a, i3 ; GFX9-NEXT: ; implicit-def: $sgpr42 ; GFX9-NEXT: ; implicit-def: $sgpr41 ; GFX9-NEXT: ; implicit-def: $sgpr40 -; GFX9-NEXT: ; implicit-def: $sgpr15 -; GFX9-NEXT: ; implicit-def: $sgpr14 -; GFX9-NEXT: ; implicit-def: $sgpr13 -; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr29 +; GFX9-NEXT: ; implicit-def: $sgpr28 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: s_branch .LBB41_2 ; ; GFX11-LABEL: bitcast_v10i64_to_v40i16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, s0 :: v_dual_mov_b32 v4, s1 +; GFX11-NEXT: v_dual_mov_b32 v5, s2 :: v_dual_mov_b32 v6, s3 +; GFX11-NEXT: v_dual_mov_b32 v7, s16 :: v_dual_mov_b32 v8, s17 +; GFX11-NEXT: v_dual_mov_b32 v9, s18 :: v_dual_mov_b32 v10, s19 +; GFX11-NEXT: v_dual_mov_b32 v11, s20 :: v_dual_mov_b32 v12, s21 +; GFX11-NEXT: v_dual_mov_b32 v13, s22 :: v_dual_mov_b32 v14, s23 +; GFX11-NEXT: v_dual_mov_b32 v15, s24 :: v_dual_mov_b32 v16, s25 +; GFX11-NEXT: v_dual_mov_b32 v17, s26 :: v_dual_mov_b32 v18, s27 +; GFX11-NEXT: v_dual_mov_b32 v19, s28 :: v_dual_mov_b32 v20, s29 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-NEXT: v_readfirstlane_b32 s5, v0 -; GFX11-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-NEXT: v_readfirstlane_b32 s2, v5 +; GFX11-NEXT: v_readfirstlane_b32 s3, v6 +; GFX11-NEXT: v_readfirstlane_b32 s4, v7 +; GFX11-NEXT: v_readfirstlane_b32 s5, v8 +; GFX11-NEXT: v_readfirstlane_b32 s6, v9 +; GFX11-NEXT: v_readfirstlane_b32 s7, v10 +; GFX11-NEXT: v_readfirstlane_b32 s8, v11 +; GFX11-NEXT: v_readfirstlane_b32 s9, v12 +; GFX11-NEXT: v_readfirstlane_b32 s10, v13 +; GFX11-NEXT: v_readfirstlane_b32 s11, v14 +; GFX11-NEXT: v_readfirstlane_b32 s12, v15 +; GFX11-NEXT: v_readfirstlane_b32 s13, v16 +; GFX11-NEXT: v_readfirstlane_b32 s14, v17 +; GFX11-NEXT: v_readfirstlane_b32 s15, v18 +; GFX11-NEXT: v_readfirstlane_b32 s16, v19 +; GFX11-NEXT: v_readfirstlane_b32 s17, v20 +; GFX11-NEXT: v_readfirstlane_b32 s19, v0 +; GFX11-NEXT: v_readfirstlane_b32 s18, v1 ; GFX11-NEXT: s_mov_b32 s58, 0 -; GFX11-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-NEXT: s_and_b32 s20, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB41_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: s_lshr_b32 s6, s4, 16 -; GFX11-NEXT: s_lshr_b32 s7, s5, 16 -; GFX11-NEXT: s_lshr_b32 s8, s29, 16 -; GFX11-NEXT: s_lshr_b32 s9, s28, 16 -; GFX11-NEXT: s_lshr_b32 s10, s27, 16 -; GFX11-NEXT: s_lshr_b32 s11, s26, 16 -; GFX11-NEXT: s_lshr_b32 s12, s25, 16 -; GFX11-NEXT: s_lshr_b32 s13, s24, 16 -; GFX11-NEXT: s_lshr_b32 s14, s23, 16 -; GFX11-NEXT: s_lshr_b32 s15, s22, 16 -; GFX11-NEXT: s_lshr_b32 s40, s21, 16 -; GFX11-NEXT: s_lshr_b32 s41, s20, 16 -; GFX11-NEXT: s_lshr_b32 s42, s19, 16 -; GFX11-NEXT: s_lshr_b32 s43, s18, 16 -; GFX11-NEXT: s_lshr_b32 s44, s17, 16 -; GFX11-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-NEXT: s_lshr_b32 s20, s18, 16 +; GFX11-NEXT: s_lshr_b32 s21, s19, 16 +; GFX11-NEXT: s_lshr_b32 s22, s17, 16 +; GFX11-NEXT: s_lshr_b32 s23, s16, 16 +; GFX11-NEXT: s_lshr_b32 s24, s15, 16 +; GFX11-NEXT: s_lshr_b32 s25, s14, 16 +; GFX11-NEXT: s_lshr_b32 s26, s13, 16 +; GFX11-NEXT: s_lshr_b32 s27, s12, 16 +; GFX11-NEXT: s_lshr_b32 s28, s11, 16 +; GFX11-NEXT: s_lshr_b32 s29, s10, 16 +; GFX11-NEXT: s_lshr_b32 s40, s9, 16 +; GFX11-NEXT: s_lshr_b32 s41, s8, 16 +; GFX11-NEXT: s_lshr_b32 s42, s7, 16 +; GFX11-NEXT: s_lshr_b32 s43, s6, 16 +; GFX11-NEXT: s_lshr_b32 s44, s5, 16 +; GFX11-NEXT: s_lshr_b32 s45, s4, 16 ; GFX11-NEXT: s_lshr_b32 s46, s3, 16 ; GFX11-NEXT: s_lshr_b32 s47, s2, 16 ; GFX11-NEXT: s_lshr_b32 s56, s1, 16 @@ -19127,42 +19486,42 @@ define inreg <40 x i16> @bitcast_v10i64_to_v40i16_scalar(<10 x i64> inreg %a, i3 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s58 ; GFX11-NEXT: s_cbranch_vccnz .LBB41_3 ; GFX11-NEXT: .LBB41_2: ; %cmp.true -; GFX11-NEXT: s_add_u32 s5, s5, 3 -; GFX11-NEXT: s_addc_u32 s4, s4, 0 -; GFX11-NEXT: s_add_u32 s28, s28, 3 -; GFX11-NEXT: s_addc_u32 s29, s29, 0 -; GFX11-NEXT: s_add_u32 s26, s26, 3 -; GFX11-NEXT: s_addc_u32 s27, s27, 0 -; GFX11-NEXT: s_add_u32 s24, s24, 3 -; GFX11-NEXT: s_addc_u32 s25, s25, 0 -; GFX11-NEXT: s_add_u32 s22, s22, 3 -; GFX11-NEXT: s_addc_u32 s23, s23, 0 -; GFX11-NEXT: s_add_u32 s20, s20, 3 -; GFX11-NEXT: s_addc_u32 s21, s21, 0 -; GFX11-NEXT: s_add_u32 s18, s18, 3 -; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s19, s19, 3 +; GFX11-NEXT: s_addc_u32 s18, s18, 0 ; GFX11-NEXT: s_add_u32 s16, s16, 3 ; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s14, s14, 3 +; GFX11-NEXT: s_addc_u32 s15, s15, 0 +; GFX11-NEXT: s_add_u32 s12, s12, 3 +; GFX11-NEXT: s_addc_u32 s13, s13, 0 +; GFX11-NEXT: s_add_u32 s10, s10, 3 +; GFX11-NEXT: s_addc_u32 s11, s11, 0 +; GFX11-NEXT: s_add_u32 s8, s8, 3 +; GFX11-NEXT: s_addc_u32 s9, s9, 0 +; GFX11-NEXT: s_add_u32 s6, s6, 3 +; GFX11-NEXT: s_addc_u32 s7, s7, 0 +; GFX11-NEXT: s_add_u32 s4, s4, 3 +; GFX11-NEXT: s_addc_u32 s5, s5, 0 ; GFX11-NEXT: s_add_u32 s2, s2, 3 ; GFX11-NEXT: s_addc_u32 s3, s3, 0 ; GFX11-NEXT: s_add_u32 s0, s0, 3 ; GFX11-NEXT: s_addc_u32 s1, s1, 0 -; GFX11-NEXT: s_lshr_b32 s6, s4, 16 -; GFX11-NEXT: s_lshr_b32 s7, s5, 16 -; GFX11-NEXT: s_lshr_b32 s8, s29, 16 -; GFX11-NEXT: s_lshr_b32 s9, s28, 16 -; GFX11-NEXT: s_lshr_b32 s10, s27, 16 -; GFX11-NEXT: s_lshr_b32 s11, s26, 16 -; GFX11-NEXT: s_lshr_b32 s12, s25, 16 -; GFX11-NEXT: s_lshr_b32 s13, s24, 16 -; GFX11-NEXT: s_lshr_b32 s14, s23, 16 -; GFX11-NEXT: s_lshr_b32 s15, s22, 16 -; GFX11-NEXT: s_lshr_b32 s40, s21, 16 -; GFX11-NEXT: s_lshr_b32 s41, s20, 16 -; GFX11-NEXT: s_lshr_b32 s42, s19, 16 -; GFX11-NEXT: s_lshr_b32 s43, s18, 16 -; GFX11-NEXT: s_lshr_b32 s44, s17, 16 -; GFX11-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-NEXT: s_lshr_b32 s20, s18, 16 +; GFX11-NEXT: s_lshr_b32 s21, s19, 16 +; GFX11-NEXT: s_lshr_b32 s22, s17, 16 +; GFX11-NEXT: s_lshr_b32 s23, s16, 16 +; GFX11-NEXT: s_lshr_b32 s24, s15, 16 +; GFX11-NEXT: s_lshr_b32 s25, s14, 16 +; GFX11-NEXT: s_lshr_b32 s26, s13, 16 +; GFX11-NEXT: s_lshr_b32 s27, s12, 16 +; GFX11-NEXT: s_lshr_b32 s28, s11, 16 +; GFX11-NEXT: s_lshr_b32 s29, s10, 16 +; GFX11-NEXT: s_lshr_b32 s40, s9, 16 +; GFX11-NEXT: s_lshr_b32 s41, s8, 16 +; GFX11-NEXT: s_lshr_b32 s42, s7, 16 +; GFX11-NEXT: s_lshr_b32 s43, s6, 16 +; GFX11-NEXT: s_lshr_b32 s44, s5, 16 +; GFX11-NEXT: s_lshr_b32 s45, s4, 16 ; GFX11-NEXT: s_lshr_b32 s46, s3, 16 ; GFX11-NEXT: s_lshr_b32 s47, s2, 16 ; GFX11-NEXT: s_lshr_b32 s56, s1, 16 @@ -19173,32 +19532,32 @@ define inreg <40 x i16> @bitcast_v10i64_to_v40i16_scalar(<10 x i64> inreg %a, i3 ; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s56 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s47 ; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s46 -; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s45 -; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s44 -; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s43 -; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s42 -; GFX11-NEXT: s_pack_ll_b32_b16 s20, s20, s41 -; GFX11-NEXT: s_pack_ll_b32_b16 s21, s21, s40 -; GFX11-NEXT: s_pack_ll_b32_b16 s15, s22, s15 -; GFX11-NEXT: s_pack_ll_b32_b16 s14, s23, s14 -; GFX11-NEXT: s_pack_ll_b32_b16 s13, s24, s13 -; GFX11-NEXT: s_pack_ll_b32_b16 s12, s25, s12 -; GFX11-NEXT: s_pack_ll_b32_b16 s11, s26, s11 -; GFX11-NEXT: s_pack_ll_b32_b16 s10, s27, s10 -; GFX11-NEXT: s_pack_ll_b32_b16 s9, s28, s9 -; GFX11-NEXT: s_pack_ll_b32_b16 s8, s29, s8 -; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s7 -; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s45 +; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s44 +; GFX11-NEXT: s_pack_ll_b32_b16 s6, s6, s43 +; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s42 +; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s41 +; GFX11-NEXT: s_pack_ll_b32_b16 s9, s9, s40 +; GFX11-NEXT: s_pack_ll_b32_b16 s10, s10, s29 +; GFX11-NEXT: s_pack_ll_b32_b16 s11, s11, s28 +; GFX11-NEXT: s_pack_ll_b32_b16 s12, s12, s27 +; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s26 +; GFX11-NEXT: s_pack_ll_b32_b16 s14, s14, s25 +; GFX11-NEXT: s_pack_ll_b32_b16 s15, s15, s24 +; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s23 +; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s22 +; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s21 +; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s20 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 -; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 -; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 -; GFX11-NEXT: v_dual_mov_b32 v10, s15 :: v_dual_mov_b32 v11, s14 -; GFX11-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s12 -; GFX11-NEXT: v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v15, s10 -; GFX11-NEXT: v_dual_mov_b32 v16, s9 :: v_dual_mov_b32 v17, s8 -; GFX11-NEXT: v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v19, s4 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GFX11-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-NEXT: v_dual_mov_b32 v18, s19 :: v_dual_mov_b32 v19, s18 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB41_4: ; GFX11-NEXT: ; implicit-def: $sgpr57 @@ -19211,16 +19570,16 @@ define inreg <40 x i16> @bitcast_v10i64_to_v40i16_scalar(<10 x i64> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr41 ; GFX11-NEXT: ; implicit-def: $sgpr40 -; GFX11-NEXT: ; implicit-def: $sgpr15 -; GFX11-NEXT: ; implicit-def: $sgpr14 -; GFX11-NEXT: ; implicit-def: $sgpr13 -; GFX11-NEXT: ; implicit-def: $sgpr12 -; GFX11-NEXT: ; implicit-def: $sgpr11 -; GFX11-NEXT: ; implicit-def: $sgpr10 -; GFX11-NEXT: ; implicit-def: $sgpr9 -; GFX11-NEXT: ; implicit-def: $sgpr8 -; GFX11-NEXT: ; implicit-def: $sgpr7 -; GFX11-NEXT: ; implicit-def: $sgpr6 +; GFX11-NEXT: ; implicit-def: $sgpr29 +; GFX11-NEXT: ; implicit-def: $sgpr28 +; GFX11-NEXT: ; implicit-def: $sgpr27 +; GFX11-NEXT: ; implicit-def: $sgpr26 +; GFX11-NEXT: ; implicit-def: $sgpr25 +; GFX11-NEXT: ; implicit-def: $sgpr24 +; GFX11-NEXT: ; implicit-def: $sgpr23 +; GFX11-NEXT: ; implicit-def: $sgpr22 +; GFX11-NEXT: ; implicit-def: $sgpr21 +; GFX11-NEXT: ; implicit-def: $sgpr20 ; GFX11-NEXT: s_branch .LBB41_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -19525,7 +19884,7 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v19 ; SI-NEXT: .LBB42_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload @@ -21806,7 +22165,35 @@ define inreg <40 x half> @bitcast_v10i64_to_v40f16_scalar(<10 x i64> inreg %a, i ; SI-LABEL: bitcast_v10i64_to_v40f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, s16 +; SI-NEXT: v_mov_b32_e32 v9, s17 +; SI-NEXT: v_mov_b32_e32 v10, s18 +; SI-NEXT: v_mov_b32_e32 v11, s19 +; SI-NEXT: v_mov_b32_e32 v12, s20 +; SI-NEXT: v_mov_b32_e32 v13, s21 +; SI-NEXT: v_mov_b32_e32 v14, s22 +; SI-NEXT: v_mov_b32_e32 v15, s23 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v17, s25 +; SI-NEXT: v_mov_b32_e32 v18, s26 +; SI-NEXT: v_mov_b32_e32 v19, s27 +; SI-NEXT: v_readfirstlane_b32 s22, v8 +; SI-NEXT: v_mov_b32_e32 v8, s28 +; SI-NEXT: v_readfirstlane_b32 s25, v9 +; SI-NEXT: v_mov_b32_e32 v9, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: v_readfirstlane_b32 s23, v10 +; SI-NEXT: v_readfirstlane_b32 s24, v11 +; SI-NEXT: v_readfirstlane_b32 s20, v12 +; SI-NEXT: v_readfirstlane_b32 s21, v13 +; SI-NEXT: v_readfirstlane_b32 s18, v14 +; SI-NEXT: v_readfirstlane_b32 s19, v15 +; SI-NEXT: v_readfirstlane_b32 s16, v16 +; SI-NEXT: v_readfirstlane_b32 s17, v17 +; SI-NEXT: v_readfirstlane_b32 s14, v18 +; SI-NEXT: v_readfirstlane_b32 s15, v19 +; SI-NEXT: v_readfirstlane_b32 s12, v8 +; SI-NEXT: v_readfirstlane_b32 s13, v9 ; SI-NEXT: v_readfirstlane_b32 s10, v1 ; SI-NEXT: v_readfirstlane_b32 s11, v2 ; SI-NEXT: v_readfirstlane_b32 s7, v3 @@ -21828,33 +22215,33 @@ define inreg <40 x half> @bitcast_v10i64_to_v40f16_scalar(<10 x i64> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_lshr_b32 s4, s10, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s4, s13, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: s_lshr_b32 s4, s12, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: s_lshr_b32 s4, s15, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: s_lshr_b32 s4, s14, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: s_lshr_b32 s4, s17, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: s_lshr_b32 s4, s19, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: s_lshr_b32 s4, s18, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: s_lshr_b32 s4, s24, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: s_lshr_b32 s4, s23, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: s_lshr_b32 s4, s25, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_lshr_b32 s4, s22, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 @@ -21862,50 +22249,50 @@ define inreg <40 x half> @bitcast_v10i64_to_v40f16_scalar(<10 x i64> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s18 ; SI-NEXT: v_cvt_f32_f16_e32 v30, s21 ; SI-NEXT: v_cvt_f32_f16_e32 v32, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s22 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s16, 3 -; SI-NEXT: s_addc_u32 s5, s17, 0 -; SI-NEXT: s_lshr_b32 s12, s4, 16 -; SI-NEXT: s_lshr_b32 s13, s5, 16 -; SI-NEXT: s_add_u32 s14, s18, 3 -; SI-NEXT: s_addc_u32 s15, s19, 0 -; SI-NEXT: s_lshr_b32 s16, s14, 16 -; SI-NEXT: s_lshr_b32 s17, s15, 16 -; SI-NEXT: s_add_u32 s18, s20, 3 -; SI-NEXT: s_addc_u32 s19, s21, 0 -; SI-NEXT: s_lshr_b32 s20, s18, 16 -; SI-NEXT: s_lshr_b32 s21, s19, 16 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_lshr_b32 s40, s22, 16 -; SI-NEXT: s_lshr_b32 s41, s23, 16 -; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s25, s25, 0 -; SI-NEXT: s_lshr_b32 s42, s24, 16 -; SI-NEXT: s_lshr_b32 s43, s25, 16 -; SI-NEXT: s_add_u32 s26, s26, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 -; SI-NEXT: s_lshr_b32 s44, s26, 16 -; SI-NEXT: s_lshr_b32 s45, s27, 16 -; SI-NEXT: s_add_u32 s28, s28, 3 -; SI-NEXT: s_addc_u32 s29, s29, 0 -; SI-NEXT: s_lshr_b32 s46, s28, 16 -; SI-NEXT: s_lshr_b32 s47, s29, 16 +; SI-NEXT: s_add_u32 s4, s22, 3 +; SI-NEXT: s_addc_u32 s5, s25, 0 +; SI-NEXT: s_lshr_b32 s22, s4, 16 +; SI-NEXT: s_lshr_b32 s25, s5, 16 +; SI-NEXT: s_add_u32 s23, s23, 3 +; SI-NEXT: s_addc_u32 s24, s24, 0 +; SI-NEXT: s_lshr_b32 s26, s23, 16 +; SI-NEXT: s_lshr_b32 s27, s24, 16 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_lshr_b32 s28, s20, 16 +; SI-NEXT: s_lshr_b32 s29, s21, 16 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_lshr_b32 s40, s18, 16 +; SI-NEXT: s_lshr_b32 s41, s19, 16 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_lshr_b32 s42, s16, 16 +; SI-NEXT: s_lshr_b32 s43, s17, 16 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 +; SI-NEXT: s_lshr_b32 s44, s14, 16 +; SI-NEXT: s_lshr_b32 s45, s15, 16 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_lshr_b32 s46, s12, 16 +; SI-NEXT: s_lshr_b32 s47, s13, 16 ; SI-NEXT: s_add_u32 s10, s10, 3 ; SI-NEXT: s_addc_u32 s11, s11, 0 ; SI-NEXT: s_lshr_b32 s56, s10, 16 @@ -21924,18 +22311,18 @@ define inreg <40 x half> @bitcast_v10i64_to_v40f16_scalar(<10 x i64> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s23 ; SI-NEXT: v_cvt_f32_f16_e32 v37, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s61 @@ -21952,12 +22339,12 @@ define inreg <40 x half> @bitcast_v10i64_to_v40f16_scalar(<10 x i64> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v23, s42 ; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 ; SI-NEXT: v_cvt_f32_f16_e32 v27, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s22 ; SI-NEXT: .LBB45_3: ; %end ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 @@ -22145,7 +22532,35 @@ define inreg <40 x half> @bitcast_v10i64_to_v40f16_scalar(<10 x i64> inreg %a, i ; VI-LABEL: bitcast_v10i64_to_v40f16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, s16 +; VI-NEXT: v_mov_b32_e32 v8, s17 +; VI-NEXT: v_mov_b32_e32 v9, s18 +; VI-NEXT: v_mov_b32_e32 v10, s19 +; VI-NEXT: v_mov_b32_e32 v11, s20 +; VI-NEXT: v_mov_b32_e32 v12, s21 +; VI-NEXT: v_mov_b32_e32 v13, s22 +; VI-NEXT: v_mov_b32_e32 v14, s23 +; VI-NEXT: v_mov_b32_e32 v15, s24 +; VI-NEXT: v_mov_b32_e32 v16, s25 +; VI-NEXT: v_mov_b32_e32 v17, s26 +; VI-NEXT: v_mov_b32_e32 v18, s27 +; VI-NEXT: v_mov_b32_e32 v19, s28 +; VI-NEXT: v_readfirstlane_b32 s25, v7 +; VI-NEXT: v_mov_b32_e32 v7, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_readfirstlane_b32 s24, v8 +; VI-NEXT: v_readfirstlane_b32 s23, v9 +; VI-NEXT: v_readfirstlane_b32 s22, v10 +; VI-NEXT: v_readfirstlane_b32 s21, v11 +; VI-NEXT: v_readfirstlane_b32 s20, v12 +; VI-NEXT: v_readfirstlane_b32 s19, v13 +; VI-NEXT: v_readfirstlane_b32 s18, v14 +; VI-NEXT: v_readfirstlane_b32 s17, v15 +; VI-NEXT: v_readfirstlane_b32 s16, v16 +; VI-NEXT: v_readfirstlane_b32 s15, v17 +; VI-NEXT: v_readfirstlane_b32 s14, v18 +; VI-NEXT: v_readfirstlane_b32 s13, v19 +; VI-NEXT: v_readfirstlane_b32 s12, v7 ; VI-NEXT: v_readfirstlane_b32 s11, v0 ; VI-NEXT: v_readfirstlane_b32 s10, v1 ; VI-NEXT: v_readfirstlane_b32 s9, v2 @@ -22155,26 +22570,26 @@ define inreg <40 x half> @bitcast_v10i64_to_v40f16_scalar(<10 x i64> inreg %a, i ; VI-NEXT: v_readfirstlane_b32 s7, v5 ; VI-NEXT: s_cbranch_scc0 .LBB45_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s12, s7, 16 -; VI-NEXT: s_lshr_b32 s13, s6, 16 -; VI-NEXT: s_lshr_b32 s14, s8, 16 -; VI-NEXT: s_lshr_b32 s15, s9, 16 +; VI-NEXT: s_lshr_b32 s26, s7, 16 +; VI-NEXT: s_lshr_b32 s27, s6, 16 +; VI-NEXT: s_lshr_b32 s28, s8, 16 +; VI-NEXT: s_lshr_b32 s29, s9, 16 ; VI-NEXT: s_lshr_b32 s40, s10, 16 ; VI-NEXT: s_lshr_b32 s41, s11, 16 -; VI-NEXT: s_lshr_b32 s42, s29, 16 -; VI-NEXT: s_lshr_b32 s43, s28, 16 -; VI-NEXT: s_lshr_b32 s44, s27, 16 -; VI-NEXT: s_lshr_b32 s45, s26, 16 -; VI-NEXT: s_lshr_b32 s46, s25, 16 -; VI-NEXT: s_lshr_b32 s47, s24, 16 -; VI-NEXT: s_lshr_b32 s56, s23, 16 -; VI-NEXT: s_lshr_b32 s57, s22, 16 -; VI-NEXT: s_lshr_b32 s58, s21, 16 -; VI-NEXT: s_lshr_b32 s59, s20, 16 -; VI-NEXT: s_lshr_b32 s60, s19, 16 -; VI-NEXT: s_lshr_b32 s61, s18, 16 -; VI-NEXT: s_lshr_b32 s62, s17, 16 -; VI-NEXT: s_lshr_b32 s63, s16, 16 +; VI-NEXT: s_lshr_b32 s42, s12, 16 +; VI-NEXT: s_lshr_b32 s43, s13, 16 +; VI-NEXT: s_lshr_b32 s44, s14, 16 +; VI-NEXT: s_lshr_b32 s45, s15, 16 +; VI-NEXT: s_lshr_b32 s46, s16, 16 +; VI-NEXT: s_lshr_b32 s47, s17, 16 +; VI-NEXT: s_lshr_b32 s56, s18, 16 +; VI-NEXT: s_lshr_b32 s57, s19, 16 +; VI-NEXT: s_lshr_b32 s58, s20, 16 +; VI-NEXT: s_lshr_b32 s59, s21, 16 +; VI-NEXT: s_lshr_b32 s60, s22, 16 +; VI-NEXT: s_lshr_b32 s61, s23, 16 +; VI-NEXT: s_lshr_b32 s62, s24, 16 +; VI-NEXT: s_lshr_b32 s63, s25, 16 ; VI-NEXT: s_cbranch_execnz .LBB45_3 ; VI-NEXT: .LBB45_2: ; %cmp.true ; VI-NEXT: s_add_u32 s6, s6, 3 @@ -22183,115 +22598,115 @@ define inreg <40 x half> @bitcast_v10i64_to_v40f16_scalar(<10 x i64> inreg %a, i ; VI-NEXT: s_addc_u32 s8, s8, 0 ; VI-NEXT: s_add_u32 s11, s11, 3 ; VI-NEXT: s_addc_u32 s10, s10, 0 -; VI-NEXT: s_add_u32 s28, s28, 3 -; VI-NEXT: s_addc_u32 s29, s29, 0 -; VI-NEXT: s_add_u32 s26, s26, 3 -; VI-NEXT: s_addc_u32 s27, s27, 0 -; VI-NEXT: s_add_u32 s24, s24, 3 -; VI-NEXT: s_addc_u32 s25, s25, 0 -; VI-NEXT: s_add_u32 s22, s22, 3 -; VI-NEXT: s_addc_u32 s23, s23, 0 -; VI-NEXT: s_add_u32 s20, s20, 3 -; VI-NEXT: s_addc_u32 s21, s21, 0 -; VI-NEXT: s_add_u32 s18, s18, 3 -; VI-NEXT: s_addc_u32 s19, s19, 0 -; VI-NEXT: s_add_u32 s16, s16, 3 -; VI-NEXT: s_addc_u32 s17, s17, 0 -; VI-NEXT: s_lshr_b32 s12, s7, 16 -; VI-NEXT: s_lshr_b32 s13, s6, 16 -; VI-NEXT: s_lshr_b32 s14, s8, 16 -; VI-NEXT: s_lshr_b32 s15, s9, 16 +; VI-NEXT: s_add_u32 s13, s13, 3 +; VI-NEXT: s_addc_u32 s12, s12, 0 +; VI-NEXT: s_add_u32 s15, s15, 3 +; VI-NEXT: s_addc_u32 s14, s14, 0 +; VI-NEXT: s_add_u32 s17, s17, 3 +; VI-NEXT: s_addc_u32 s16, s16, 0 +; VI-NEXT: s_add_u32 s19, s19, 3 +; VI-NEXT: s_addc_u32 s18, s18, 0 +; VI-NEXT: s_add_u32 s21, s21, 3 +; VI-NEXT: s_addc_u32 s20, s20, 0 +; VI-NEXT: s_add_u32 s23, s23, 3 +; VI-NEXT: s_addc_u32 s22, s22, 0 +; VI-NEXT: s_add_u32 s25, s25, 3 +; VI-NEXT: s_addc_u32 s24, s24, 0 +; VI-NEXT: s_lshr_b32 s26, s7, 16 +; VI-NEXT: s_lshr_b32 s27, s6, 16 +; VI-NEXT: s_lshr_b32 s28, s8, 16 +; VI-NEXT: s_lshr_b32 s29, s9, 16 ; VI-NEXT: s_lshr_b32 s40, s10, 16 ; VI-NEXT: s_lshr_b32 s41, s11, 16 -; VI-NEXT: s_lshr_b32 s42, s29, 16 -; VI-NEXT: s_lshr_b32 s43, s28, 16 -; VI-NEXT: s_lshr_b32 s44, s27, 16 -; VI-NEXT: s_lshr_b32 s45, s26, 16 -; VI-NEXT: s_lshr_b32 s46, s25, 16 -; VI-NEXT: s_lshr_b32 s47, s24, 16 -; VI-NEXT: s_lshr_b32 s56, s23, 16 -; VI-NEXT: s_lshr_b32 s57, s22, 16 -; VI-NEXT: s_lshr_b32 s58, s21, 16 -; VI-NEXT: s_lshr_b32 s59, s20, 16 -; VI-NEXT: s_lshr_b32 s60, s19, 16 -; VI-NEXT: s_lshr_b32 s61, s18, 16 -; VI-NEXT: s_lshr_b32 s62, s17, 16 -; VI-NEXT: s_lshr_b32 s63, s16, 16 +; VI-NEXT: s_lshr_b32 s42, s12, 16 +; VI-NEXT: s_lshr_b32 s43, s13, 16 +; VI-NEXT: s_lshr_b32 s44, s14, 16 +; VI-NEXT: s_lshr_b32 s45, s15, 16 +; VI-NEXT: s_lshr_b32 s46, s16, 16 +; VI-NEXT: s_lshr_b32 s47, s17, 16 +; VI-NEXT: s_lshr_b32 s56, s18, 16 +; VI-NEXT: s_lshr_b32 s57, s19, 16 +; VI-NEXT: s_lshr_b32 s58, s20, 16 +; VI-NEXT: s_lshr_b32 s59, s21, 16 +; VI-NEXT: s_lshr_b32 s60, s22, 16 +; VI-NEXT: s_lshr_b32 s61, s23, 16 +; VI-NEXT: s_lshr_b32 s62, s24, 16 +; VI-NEXT: s_lshr_b32 s63, s25, 16 ; VI-NEXT: .LBB45_3: ; %end -; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_and_b32 s4, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s5, s63, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s16, s62, 16 -; VI-NEXT: s_or_b32 s5, s5, s16 -; VI-NEXT: s_and_b32 s16, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s17, s61, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_and_b32 s17, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s18, s60, 16 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s18, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s19, s59, 16 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_and_b32 s19, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s20, s58, 16 -; VI-NEXT: s_or_b32 s19, s19, s20 -; VI-NEXT: s_and_b32 s20, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s21, s57, 16 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_and_b32 s21, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s22, s56, 16 -; VI-NEXT: s_or_b32 s21, s21, s22 -; VI-NEXT: s_and_b32 s22, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s23, s47, 16 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_and_b32 s23, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s24, s46, 16 +; VI-NEXT: s_and_b32 s5, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s24, s62, 16 +; VI-NEXT: s_or_b32 s5, s5, s24 +; VI-NEXT: s_and_b32 s23, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s24, s61, 16 ; VI-NEXT: s_or_b32 s23, s23, s24 -; VI-NEXT: s_and_b32 s24, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s25, s45, 16 -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_and_b32 s25, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s26, s44, 16 -; VI-NEXT: s_or_b32 s25, s25, s26 -; VI-NEXT: s_and_b32 s26, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s27, s43, 16 -; VI-NEXT: s_or_b32 s26, s26, s27 -; VI-NEXT: s_and_b32 s27, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s28, s42, 16 -; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s22, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s24, s60, 16 +; VI-NEXT: s_or_b32 s22, s22, s24 +; VI-NEXT: s_and_b32 s21, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s24, s59, 16 +; VI-NEXT: s_or_b32 s21, s21, s24 +; VI-NEXT: s_and_b32 s20, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s24, s58, 16 +; VI-NEXT: s_or_b32 s20, s20, s24 +; VI-NEXT: s_and_b32 s19, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s24, s57, 16 +; VI-NEXT: s_or_b32 s19, s19, s24 +; VI-NEXT: s_and_b32 s18, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s24, s56, 16 +; VI-NEXT: s_or_b32 s18, s18, s24 +; VI-NEXT: s_and_b32 s17, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s24, s47, 16 +; VI-NEXT: s_or_b32 s17, s17, s24 +; VI-NEXT: s_and_b32 s16, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s24, s46, 16 +; VI-NEXT: s_or_b32 s16, s16, s24 +; VI-NEXT: s_and_b32 s15, 0xffff, s15 +; VI-NEXT: s_lshl_b32 s24, s45, 16 +; VI-NEXT: s_or_b32 s15, s15, s24 +; VI-NEXT: s_and_b32 s14, 0xffff, s14 +; VI-NEXT: s_lshl_b32 s24, s44, 16 +; VI-NEXT: s_or_b32 s14, s14, s24 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s24, s43, 16 +; VI-NEXT: s_or_b32 s13, s13, s24 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s24, s42, 16 +; VI-NEXT: s_or_b32 s12, s12, s24 ; VI-NEXT: s_and_b32 s11, 0xffff, s11 -; VI-NEXT: s_lshl_b32 s28, s41, 16 -; VI-NEXT: s_or_b32 s11, s11, s28 +; VI-NEXT: s_lshl_b32 s24, s41, 16 +; VI-NEXT: s_or_b32 s11, s11, s24 ; VI-NEXT: s_and_b32 s10, 0xffff, s10 -; VI-NEXT: s_lshl_b32 s28, s40, 16 +; VI-NEXT: s_lshl_b32 s24, s40, 16 +; VI-NEXT: s_or_b32 s10, s10, s24 ; VI-NEXT: s_and_b32 s9, 0xffff, s9 -; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_lshl_b32 s24, s29, 16 +; VI-NEXT: s_or_b32 s9, s9, s24 ; VI-NEXT: s_and_b32 s8, 0xffff, s8 -; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_lshl_b32 s24, s28, 16 +; VI-NEXT: s_or_b32 s8, s8, s24 ; VI-NEXT: s_and_b32 s6, 0xffff, s6 -; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_lshl_b32 s24, s27, 16 +; VI-NEXT: s_or_b32 s6, s6, s24 ; VI-NEXT: s_and_b32 s7, 0xffff, s7 -; VI-NEXT: s_lshl_b32 s12, s12, 16 -; VI-NEXT: s_or_b32 s10, s10, s28 -; VI-NEXT: s_or_b32 s9, s9, s15 -; VI-NEXT: s_or_b32 s8, s8, s14 -; VI-NEXT: s_or_b32 s6, s6, s13 -; VI-NEXT: s_or_b32 s7, s7, s12 +; VI-NEXT: s_lshl_b32 s24, s26, 16 +; VI-NEXT: s_or_b32 s7, s7, s24 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: v_mov_b32_e32 v3, s17 -; VI-NEXT: v_mov_b32_e32 v4, s18 -; VI-NEXT: v_mov_b32_e32 v5, s19 -; VI-NEXT: v_mov_b32_e32 v6, s20 -; VI-NEXT: v_mov_b32_e32 v7, s21 -; VI-NEXT: v_mov_b32_e32 v8, s22 -; VI-NEXT: v_mov_b32_e32 v9, s23 -; VI-NEXT: v_mov_b32_e32 v10, s24 -; VI-NEXT: v_mov_b32_e32 v11, s25 -; VI-NEXT: v_mov_b32_e32 v12, s26 -; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v2, s23 +; VI-NEXT: v_mov_b32_e32 v3, s22 +; VI-NEXT: v_mov_b32_e32 v4, s21 +; VI-NEXT: v_mov_b32_e32 v5, s20 +; VI-NEXT: v_mov_b32_e32 v6, s19 +; VI-NEXT: v_mov_b32_e32 v7, s18 +; VI-NEXT: v_mov_b32_e32 v8, s17 +; VI-NEXT: v_mov_b32_e32 v9, s16 +; VI-NEXT: v_mov_b32_e32 v10, s15 +; VI-NEXT: v_mov_b32_e32 v11, s14 +; VI-NEXT: v_mov_b32_e32 v12, s13 +; VI-NEXT: v_mov_b32_e32 v13, s12 ; VI-NEXT: v_mov_b32_e32 v14, s11 ; VI-NEXT: v_mov_b32_e32 v15, s10 ; VI-NEXT: v_mov_b32_e32 v16, s9 @@ -22316,57 +22731,75 @@ define inreg <40 x half> @bitcast_v10i64_to_v40f16_scalar(<10 x i64> inreg %a, i ; VI-NEXT: ; implicit-def: $sgpr42 ; VI-NEXT: ; implicit-def: $sgpr41 ; VI-NEXT: ; implicit-def: $sgpr40 -; VI-NEXT: ; implicit-def: $sgpr15 -; VI-NEXT: ; implicit-def: $sgpr14 -; VI-NEXT: ; implicit-def: $sgpr13 -; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: s_branch .LBB45_2 ; ; GFX9-LABEL: bitcast_v10i64_to_v40f16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, s16 +; GFX9-NEXT: v_mov_b32_e32 v8, s17 +; GFX9-NEXT: v_mov_b32_e32 v9, s18 +; GFX9-NEXT: v_mov_b32_e32 v10, s19 +; GFX9-NEXT: v_mov_b32_e32 v11, s20 +; GFX9-NEXT: v_mov_b32_e32 v12, s21 +; GFX9-NEXT: v_mov_b32_e32 v13, s22 +; GFX9-NEXT: v_mov_b32_e32 v14, s23 +; GFX9-NEXT: v_mov_b32_e32 v15, s24 +; GFX9-NEXT: v_mov_b32_e32 v16, s25 +; GFX9-NEXT: v_mov_b32_e32 v17, s26 +; GFX9-NEXT: v_mov_b32_e32 v18, s27 +; GFX9-NEXT: v_mov_b32_e32 v19, s28 +; GFX9-NEXT: v_readfirstlane_b32 s6, v7 +; GFX9-NEXT: v_mov_b32_e32 v7, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: v_readfirstlane_b32 s7, v1 -; GFX9-NEXT: v_readfirstlane_b32 s8, v2 -; GFX9-NEXT: v_readfirstlane_b32 s9, v3 -; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: v_readfirstlane_b32 s7, v8 +; GFX9-NEXT: v_readfirstlane_b32 s8, v9 +; GFX9-NEXT: v_readfirstlane_b32 s9, v10 +; GFX9-NEXT: v_readfirstlane_b32 s10, v11 +; GFX9-NEXT: v_readfirstlane_b32 s11, v12 +; GFX9-NEXT: v_readfirstlane_b32 s12, v13 +; GFX9-NEXT: v_readfirstlane_b32 s13, v14 +; GFX9-NEXT: v_readfirstlane_b32 s14, v15 +; GFX9-NEXT: v_readfirstlane_b32 s15, v16 +; GFX9-NEXT: v_readfirstlane_b32 s16, v17 +; GFX9-NEXT: v_readfirstlane_b32 s17, v18 +; GFX9-NEXT: v_readfirstlane_b32 s18, v19 +; GFX9-NEXT: v_readfirstlane_b32 s19, v7 +; GFX9-NEXT: v_readfirstlane_b32 s20, v0 +; GFX9-NEXT: v_readfirstlane_b32 s21, v1 +; GFX9-NEXT: v_readfirstlane_b32 s22, v2 +; GFX9-NEXT: v_readfirstlane_b32 s23, v3 +; GFX9-NEXT: v_readfirstlane_b32 s24, v4 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: v_readfirstlane_b32 s25, v5 ; GFX9-NEXT: s_cbranch_scc0 .LBB45_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_lshr_b32 s12, s11, 16 -; GFX9-NEXT: s_lshr_b32 s13, s10, 16 -; GFX9-NEXT: s_lshr_b32 s14, s9, 16 -; GFX9-NEXT: s_lshr_b32 s15, s8, 16 -; GFX9-NEXT: s_lshr_b32 s40, s7, 16 -; GFX9-NEXT: s_lshr_b32 s41, s6, 16 -; GFX9-NEXT: s_lshr_b32 s42, s29, 16 -; GFX9-NEXT: s_lshr_b32 s43, s28, 16 -; GFX9-NEXT: s_lshr_b32 s44, s27, 16 -; GFX9-NEXT: s_lshr_b32 s45, s26, 16 -; GFX9-NEXT: s_lshr_b32 s46, s25, 16 -; GFX9-NEXT: s_lshr_b32 s47, s24, 16 -; GFX9-NEXT: s_lshr_b32 s56, s23, 16 -; GFX9-NEXT: s_lshr_b32 s57, s22, 16 -; GFX9-NEXT: s_lshr_b32 s58, s21, 16 -; GFX9-NEXT: s_lshr_b32 s59, s20, 16 -; GFX9-NEXT: s_lshr_b32 s60, s19, 16 -; GFX9-NEXT: s_lshr_b32 s61, s18, 16 -; GFX9-NEXT: s_lshr_b32 s62, s17, 16 -; GFX9-NEXT: s_lshr_b32 s63, s16, 16 +; GFX9-NEXT: s_lshr_b32 s26, s25, 16 +; GFX9-NEXT: s_lshr_b32 s27, s24, 16 +; GFX9-NEXT: s_lshr_b32 s28, s23, 16 +; GFX9-NEXT: s_lshr_b32 s29, s22, 16 +; GFX9-NEXT: s_lshr_b32 s40, s21, 16 +; GFX9-NEXT: s_lshr_b32 s41, s20, 16 +; GFX9-NEXT: s_lshr_b32 s42, s19, 16 +; GFX9-NEXT: s_lshr_b32 s43, s18, 16 +; GFX9-NEXT: s_lshr_b32 s44, s17, 16 +; GFX9-NEXT: s_lshr_b32 s45, s16, 16 +; GFX9-NEXT: s_lshr_b32 s46, s15, 16 +; GFX9-NEXT: s_lshr_b32 s47, s14, 16 +; GFX9-NEXT: s_lshr_b32 s56, s13, 16 +; GFX9-NEXT: s_lshr_b32 s57, s12, 16 +; GFX9-NEXT: s_lshr_b32 s58, s11, 16 +; GFX9-NEXT: s_lshr_b32 s59, s10, 16 +; GFX9-NEXT: s_lshr_b32 s60, s9, 16 +; GFX9-NEXT: s_lshr_b32 s61, s8, 16 +; GFX9-NEXT: s_lshr_b32 s62, s7, 16 +; GFX9-NEXT: s_lshr_b32 s63, s6, 16 ; GFX9-NEXT: s_cbranch_execnz .LBB45_3 ; GFX9-NEXT: .LBB45_2: ; %cmp.true -; GFX9-NEXT: s_add_u32 s10, s10, 3 -; GFX9-NEXT: s_addc_u32 s11, s11, 0 -; GFX9-NEXT: s_add_u32 s8, s8, 3 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-NEXT: s_add_u32 s6, s6, 3 -; GFX9-NEXT: s_addc_u32 s7, s7, 0 -; GFX9-NEXT: s_add_u32 s28, s28, 3 -; GFX9-NEXT: s_addc_u32 s29, s29, 0 -; GFX9-NEXT: s_add_u32 s26, s26, 3 -; GFX9-NEXT: s_addc_u32 s27, s27, 0 ; GFX9-NEXT: s_add_u32 s24, s24, 3 ; GFX9-NEXT: s_addc_u32 s25, s25, 0 ; GFX9-NEXT: s_add_u32 s22, s22, 3 @@ -22377,67 +22810,77 @@ define inreg <40 x half> @bitcast_v10i64_to_v40f16_scalar(<10 x i64> inreg %a, i ; GFX9-NEXT: s_addc_u32 s19, s19, 0 ; GFX9-NEXT: s_add_u32 s16, s16, 3 ; GFX9-NEXT: s_addc_u32 s17, s17, 0 -; GFX9-NEXT: s_lshr_b32 s12, s11, 16 -; GFX9-NEXT: s_lshr_b32 s13, s10, 16 -; GFX9-NEXT: s_lshr_b32 s14, s9, 16 -; GFX9-NEXT: s_lshr_b32 s15, s8, 16 -; GFX9-NEXT: s_lshr_b32 s40, s7, 16 -; GFX9-NEXT: s_lshr_b32 s41, s6, 16 -; GFX9-NEXT: s_lshr_b32 s42, s29, 16 -; GFX9-NEXT: s_lshr_b32 s43, s28, 16 -; GFX9-NEXT: s_lshr_b32 s44, s27, 16 -; GFX9-NEXT: s_lshr_b32 s45, s26, 16 -; GFX9-NEXT: s_lshr_b32 s46, s25, 16 -; GFX9-NEXT: s_lshr_b32 s47, s24, 16 -; GFX9-NEXT: s_lshr_b32 s56, s23, 16 -; GFX9-NEXT: s_lshr_b32 s57, s22, 16 -; GFX9-NEXT: s_lshr_b32 s58, s21, 16 -; GFX9-NEXT: s_lshr_b32 s59, s20, 16 -; GFX9-NEXT: s_lshr_b32 s60, s19, 16 -; GFX9-NEXT: s_lshr_b32 s61, s18, 16 -; GFX9-NEXT: s_lshr_b32 s62, s17, 16 -; GFX9-NEXT: s_lshr_b32 s63, s16, 16 +; GFX9-NEXT: s_add_u32 s14, s14, 3 +; GFX9-NEXT: s_addc_u32 s15, s15, 0 +; GFX9-NEXT: s_add_u32 s12, s12, 3 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_add_u32 s10, s10, 3 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_lshr_b32 s26, s25, 16 +; GFX9-NEXT: s_lshr_b32 s27, s24, 16 +; GFX9-NEXT: s_lshr_b32 s28, s23, 16 +; GFX9-NEXT: s_lshr_b32 s29, s22, 16 +; GFX9-NEXT: s_lshr_b32 s40, s21, 16 +; GFX9-NEXT: s_lshr_b32 s41, s20, 16 +; GFX9-NEXT: s_lshr_b32 s42, s19, 16 +; GFX9-NEXT: s_lshr_b32 s43, s18, 16 +; GFX9-NEXT: s_lshr_b32 s44, s17, 16 +; GFX9-NEXT: s_lshr_b32 s45, s16, 16 +; GFX9-NEXT: s_lshr_b32 s46, s15, 16 +; GFX9-NEXT: s_lshr_b32 s47, s14, 16 +; GFX9-NEXT: s_lshr_b32 s56, s13, 16 +; GFX9-NEXT: s_lshr_b32 s57, s12, 16 +; GFX9-NEXT: s_lshr_b32 s58, s11, 16 +; GFX9-NEXT: s_lshr_b32 s59, s10, 16 +; GFX9-NEXT: s_lshr_b32 s60, s9, 16 +; GFX9-NEXT: s_lshr_b32 s61, s8, 16 +; GFX9-NEXT: s_lshr_b32 s62, s7, 16 +; GFX9-NEXT: s_lshr_b32 s63, s6, 16 ; GFX9-NEXT: .LBB45_3: ; %end -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s63 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s62 -; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s61 -; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s60 -; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s59 -; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s58 -; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s57 -; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s56 -; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s47 -; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s46 -; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s45 -; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s44 -; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s43 -; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s42 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s41 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s40 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s15 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s14 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s13 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s7, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s9, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s10, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s11, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s12, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s13, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s14, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s15, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s16, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s17, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s40 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s29 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s28 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s27 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s26 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s18 -; GFX9-NEXT: v_mov_b32_e32 v5, s19 -; GFX9-NEXT: v_mov_b32_e32 v6, s20 -; GFX9-NEXT: v_mov_b32_e32 v7, s21 -; GFX9-NEXT: v_mov_b32_e32 v8, s22 -; GFX9-NEXT: v_mov_b32_e32 v9, s23 -; GFX9-NEXT: v_mov_b32_e32 v10, s24 -; GFX9-NEXT: v_mov_b32_e32 v11, s25 -; GFX9-NEXT: v_mov_b32_e32 v12, s26 -; GFX9-NEXT: v_mov_b32_e32 v13, s27 -; GFX9-NEXT: v_mov_b32_e32 v14, s6 -; GFX9-NEXT: v_mov_b32_e32 v15, s7 -; GFX9-NEXT: v_mov_b32_e32 v16, s8 -; GFX9-NEXT: v_mov_b32_e32 v17, s9 -; GFX9-NEXT: v_mov_b32_e32 v18, s10 -; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-NEXT: v_mov_b32_e32 v12, s16 +; GFX9-NEXT: v_mov_b32_e32 v13, s17 +; GFX9-NEXT: v_mov_b32_e32 v14, s18 +; GFX9-NEXT: v_mov_b32_e32 v15, s19 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v18, s22 +; GFX9-NEXT: v_mov_b32_e32 v19, s23 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB45_4: ; GFX9-NEXT: ; implicit-def: $sgpr63 @@ -22456,38 +22899,65 @@ define inreg <40 x half> @bitcast_v10i64_to_v40f16_scalar(<10 x i64> inreg %a, i ; GFX9-NEXT: ; implicit-def: $sgpr42 ; GFX9-NEXT: ; implicit-def: $sgpr41 ; GFX9-NEXT: ; implicit-def: $sgpr40 -; GFX9-NEXT: ; implicit-def: $sgpr15 -; GFX9-NEXT: ; implicit-def: $sgpr14 -; GFX9-NEXT: ; implicit-def: $sgpr13 -; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr29 +; GFX9-NEXT: ; implicit-def: $sgpr28 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: s_branch .LBB45_2 ; ; GFX11-LABEL: bitcast_v10i64_to_v40f16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, s0 :: v_dual_mov_b32 v4, s1 +; GFX11-NEXT: v_dual_mov_b32 v5, s2 :: v_dual_mov_b32 v6, s3 +; GFX11-NEXT: v_dual_mov_b32 v7, s16 :: v_dual_mov_b32 v8, s17 +; GFX11-NEXT: v_dual_mov_b32 v9, s18 :: v_dual_mov_b32 v10, s19 +; GFX11-NEXT: v_dual_mov_b32 v11, s20 :: v_dual_mov_b32 v12, s21 +; GFX11-NEXT: v_dual_mov_b32 v13, s22 :: v_dual_mov_b32 v14, s23 +; GFX11-NEXT: v_dual_mov_b32 v15, s24 :: v_dual_mov_b32 v16, s25 +; GFX11-NEXT: v_dual_mov_b32 v17, s26 :: v_dual_mov_b32 v18, s27 +; GFX11-NEXT: v_dual_mov_b32 v19, s28 :: v_dual_mov_b32 v20, s29 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-NEXT: v_readfirstlane_b32 s5, v0 -; GFX11-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-NEXT: v_readfirstlane_b32 s2, v5 +; GFX11-NEXT: v_readfirstlane_b32 s3, v6 +; GFX11-NEXT: v_readfirstlane_b32 s4, v7 +; GFX11-NEXT: v_readfirstlane_b32 s5, v8 +; GFX11-NEXT: v_readfirstlane_b32 s6, v9 +; GFX11-NEXT: v_readfirstlane_b32 s7, v10 +; GFX11-NEXT: v_readfirstlane_b32 s8, v11 +; GFX11-NEXT: v_readfirstlane_b32 s9, v12 +; GFX11-NEXT: v_readfirstlane_b32 s10, v13 +; GFX11-NEXT: v_readfirstlane_b32 s11, v14 +; GFX11-NEXT: v_readfirstlane_b32 s12, v15 +; GFX11-NEXT: v_readfirstlane_b32 s13, v16 +; GFX11-NEXT: v_readfirstlane_b32 s14, v17 +; GFX11-NEXT: v_readfirstlane_b32 s15, v18 +; GFX11-NEXT: v_readfirstlane_b32 s16, v19 +; GFX11-NEXT: v_readfirstlane_b32 s17, v20 +; GFX11-NEXT: v_readfirstlane_b32 s19, v0 +; GFX11-NEXT: v_readfirstlane_b32 s18, v1 ; GFX11-NEXT: s_mov_b32 s58, 0 -; GFX11-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-NEXT: s_and_b32 s20, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB45_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: s_lshr_b32 s6, s4, 16 -; GFX11-NEXT: s_lshr_b32 s7, s5, 16 -; GFX11-NEXT: s_lshr_b32 s8, s29, 16 -; GFX11-NEXT: s_lshr_b32 s9, s28, 16 -; GFX11-NEXT: s_lshr_b32 s10, s27, 16 -; GFX11-NEXT: s_lshr_b32 s11, s26, 16 -; GFX11-NEXT: s_lshr_b32 s12, s25, 16 -; GFX11-NEXT: s_lshr_b32 s13, s24, 16 -; GFX11-NEXT: s_lshr_b32 s14, s23, 16 -; GFX11-NEXT: s_lshr_b32 s15, s22, 16 -; GFX11-NEXT: s_lshr_b32 s40, s21, 16 -; GFX11-NEXT: s_lshr_b32 s41, s20, 16 -; GFX11-NEXT: s_lshr_b32 s42, s19, 16 -; GFX11-NEXT: s_lshr_b32 s43, s18, 16 -; GFX11-NEXT: s_lshr_b32 s44, s17, 16 -; GFX11-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-NEXT: s_lshr_b32 s20, s18, 16 +; GFX11-NEXT: s_lshr_b32 s21, s19, 16 +; GFX11-NEXT: s_lshr_b32 s22, s17, 16 +; GFX11-NEXT: s_lshr_b32 s23, s16, 16 +; GFX11-NEXT: s_lshr_b32 s24, s15, 16 +; GFX11-NEXT: s_lshr_b32 s25, s14, 16 +; GFX11-NEXT: s_lshr_b32 s26, s13, 16 +; GFX11-NEXT: s_lshr_b32 s27, s12, 16 +; GFX11-NEXT: s_lshr_b32 s28, s11, 16 +; GFX11-NEXT: s_lshr_b32 s29, s10, 16 +; GFX11-NEXT: s_lshr_b32 s40, s9, 16 +; GFX11-NEXT: s_lshr_b32 s41, s8, 16 +; GFX11-NEXT: s_lshr_b32 s42, s7, 16 +; GFX11-NEXT: s_lshr_b32 s43, s6, 16 +; GFX11-NEXT: s_lshr_b32 s44, s5, 16 +; GFX11-NEXT: s_lshr_b32 s45, s4, 16 ; GFX11-NEXT: s_lshr_b32 s46, s3, 16 ; GFX11-NEXT: s_lshr_b32 s47, s2, 16 ; GFX11-NEXT: s_lshr_b32 s56, s1, 16 @@ -22495,42 +22965,42 @@ define inreg <40 x half> @bitcast_v10i64_to_v40f16_scalar(<10 x i64> inreg %a, i ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s58 ; GFX11-NEXT: s_cbranch_vccnz .LBB45_3 ; GFX11-NEXT: .LBB45_2: ; %cmp.true -; GFX11-NEXT: s_add_u32 s5, s5, 3 -; GFX11-NEXT: s_addc_u32 s4, s4, 0 -; GFX11-NEXT: s_add_u32 s28, s28, 3 -; GFX11-NEXT: s_addc_u32 s29, s29, 0 -; GFX11-NEXT: s_add_u32 s26, s26, 3 -; GFX11-NEXT: s_addc_u32 s27, s27, 0 -; GFX11-NEXT: s_add_u32 s24, s24, 3 -; GFX11-NEXT: s_addc_u32 s25, s25, 0 -; GFX11-NEXT: s_add_u32 s22, s22, 3 -; GFX11-NEXT: s_addc_u32 s23, s23, 0 -; GFX11-NEXT: s_add_u32 s20, s20, 3 -; GFX11-NEXT: s_addc_u32 s21, s21, 0 -; GFX11-NEXT: s_add_u32 s18, s18, 3 -; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s19, s19, 3 +; GFX11-NEXT: s_addc_u32 s18, s18, 0 ; GFX11-NEXT: s_add_u32 s16, s16, 3 ; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s14, s14, 3 +; GFX11-NEXT: s_addc_u32 s15, s15, 0 +; GFX11-NEXT: s_add_u32 s12, s12, 3 +; GFX11-NEXT: s_addc_u32 s13, s13, 0 +; GFX11-NEXT: s_add_u32 s10, s10, 3 +; GFX11-NEXT: s_addc_u32 s11, s11, 0 +; GFX11-NEXT: s_add_u32 s8, s8, 3 +; GFX11-NEXT: s_addc_u32 s9, s9, 0 +; GFX11-NEXT: s_add_u32 s6, s6, 3 +; GFX11-NEXT: s_addc_u32 s7, s7, 0 +; GFX11-NEXT: s_add_u32 s4, s4, 3 +; GFX11-NEXT: s_addc_u32 s5, s5, 0 ; GFX11-NEXT: s_add_u32 s2, s2, 3 ; GFX11-NEXT: s_addc_u32 s3, s3, 0 ; GFX11-NEXT: s_add_u32 s0, s0, 3 ; GFX11-NEXT: s_addc_u32 s1, s1, 0 -; GFX11-NEXT: s_lshr_b32 s6, s4, 16 -; GFX11-NEXT: s_lshr_b32 s7, s5, 16 -; GFX11-NEXT: s_lshr_b32 s8, s29, 16 -; GFX11-NEXT: s_lshr_b32 s9, s28, 16 -; GFX11-NEXT: s_lshr_b32 s10, s27, 16 -; GFX11-NEXT: s_lshr_b32 s11, s26, 16 -; GFX11-NEXT: s_lshr_b32 s12, s25, 16 -; GFX11-NEXT: s_lshr_b32 s13, s24, 16 -; GFX11-NEXT: s_lshr_b32 s14, s23, 16 -; GFX11-NEXT: s_lshr_b32 s15, s22, 16 -; GFX11-NEXT: s_lshr_b32 s40, s21, 16 -; GFX11-NEXT: s_lshr_b32 s41, s20, 16 -; GFX11-NEXT: s_lshr_b32 s42, s19, 16 -; GFX11-NEXT: s_lshr_b32 s43, s18, 16 -; GFX11-NEXT: s_lshr_b32 s44, s17, 16 -; GFX11-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-NEXT: s_lshr_b32 s20, s18, 16 +; GFX11-NEXT: s_lshr_b32 s21, s19, 16 +; GFX11-NEXT: s_lshr_b32 s22, s17, 16 +; GFX11-NEXT: s_lshr_b32 s23, s16, 16 +; GFX11-NEXT: s_lshr_b32 s24, s15, 16 +; GFX11-NEXT: s_lshr_b32 s25, s14, 16 +; GFX11-NEXT: s_lshr_b32 s26, s13, 16 +; GFX11-NEXT: s_lshr_b32 s27, s12, 16 +; GFX11-NEXT: s_lshr_b32 s28, s11, 16 +; GFX11-NEXT: s_lshr_b32 s29, s10, 16 +; GFX11-NEXT: s_lshr_b32 s40, s9, 16 +; GFX11-NEXT: s_lshr_b32 s41, s8, 16 +; GFX11-NEXT: s_lshr_b32 s42, s7, 16 +; GFX11-NEXT: s_lshr_b32 s43, s6, 16 +; GFX11-NEXT: s_lshr_b32 s44, s5, 16 +; GFX11-NEXT: s_lshr_b32 s45, s4, 16 ; GFX11-NEXT: s_lshr_b32 s46, s3, 16 ; GFX11-NEXT: s_lshr_b32 s47, s2, 16 ; GFX11-NEXT: s_lshr_b32 s56, s1, 16 @@ -22541,32 +23011,32 @@ define inreg <40 x half> @bitcast_v10i64_to_v40f16_scalar(<10 x i64> inreg %a, i ; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s56 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s47 ; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s46 -; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s45 -; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s44 -; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s43 -; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s42 -; GFX11-NEXT: s_pack_ll_b32_b16 s20, s20, s41 -; GFX11-NEXT: s_pack_ll_b32_b16 s21, s21, s40 -; GFX11-NEXT: s_pack_ll_b32_b16 s15, s22, s15 -; GFX11-NEXT: s_pack_ll_b32_b16 s14, s23, s14 -; GFX11-NEXT: s_pack_ll_b32_b16 s13, s24, s13 -; GFX11-NEXT: s_pack_ll_b32_b16 s12, s25, s12 -; GFX11-NEXT: s_pack_ll_b32_b16 s11, s26, s11 -; GFX11-NEXT: s_pack_ll_b32_b16 s10, s27, s10 -; GFX11-NEXT: s_pack_ll_b32_b16 s9, s28, s9 -; GFX11-NEXT: s_pack_ll_b32_b16 s8, s29, s8 -; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s7 -; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s45 +; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s44 +; GFX11-NEXT: s_pack_ll_b32_b16 s6, s6, s43 +; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s42 +; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s41 +; GFX11-NEXT: s_pack_ll_b32_b16 s9, s9, s40 +; GFX11-NEXT: s_pack_ll_b32_b16 s10, s10, s29 +; GFX11-NEXT: s_pack_ll_b32_b16 s11, s11, s28 +; GFX11-NEXT: s_pack_ll_b32_b16 s12, s12, s27 +; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s26 +; GFX11-NEXT: s_pack_ll_b32_b16 s14, s14, s25 +; GFX11-NEXT: s_pack_ll_b32_b16 s15, s15, s24 +; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s23 +; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s22 +; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s21 +; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s20 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 -; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 -; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 -; GFX11-NEXT: v_dual_mov_b32 v10, s15 :: v_dual_mov_b32 v11, s14 -; GFX11-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s12 -; GFX11-NEXT: v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v15, s10 -; GFX11-NEXT: v_dual_mov_b32 v16, s9 :: v_dual_mov_b32 v17, s8 -; GFX11-NEXT: v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v19, s4 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GFX11-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-NEXT: v_dual_mov_b32 v18, s19 :: v_dual_mov_b32 v19, s18 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB45_4: ; GFX11-NEXT: ; implicit-def: $sgpr57 @@ -22579,16 +23049,16 @@ define inreg <40 x half> @bitcast_v10i64_to_v40f16_scalar(<10 x i64> inreg %a, i ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr41 ; GFX11-NEXT: ; implicit-def: $sgpr40 -; GFX11-NEXT: ; implicit-def: $sgpr15 -; GFX11-NEXT: ; implicit-def: $sgpr14 -; GFX11-NEXT: ; implicit-def: $sgpr13 -; GFX11-NEXT: ; implicit-def: $sgpr12 -; GFX11-NEXT: ; implicit-def: $sgpr11 -; GFX11-NEXT: ; implicit-def: $sgpr10 -; GFX11-NEXT: ; implicit-def: $sgpr9 -; GFX11-NEXT: ; implicit-def: $sgpr8 -; GFX11-NEXT: ; implicit-def: $sgpr7 -; GFX11-NEXT: ; implicit-def: $sgpr6 +; GFX11-NEXT: ; implicit-def: $sgpr29 +; GFX11-NEXT: ; implicit-def: $sgpr28 +; GFX11-NEXT: ; implicit-def: $sgpr27 +; GFX11-NEXT: ; implicit-def: $sgpr26 +; GFX11-NEXT: ; implicit-def: $sgpr25 +; GFX11-NEXT: ; implicit-def: $sgpr24 +; GFX11-NEXT: ; implicit-def: $sgpr23 +; GFX11-NEXT: ; implicit-def: $sgpr22 +; GFX11-NEXT: ; implicit-def: $sgpr21 +; GFX11-NEXT: ; implicit-def: $sgpr20 ; GFX11-NEXT: s_branch .LBB45_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -26261,7 +26731,7 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v19 ; SI-NEXT: .LBB50_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload @@ -28473,341 +28943,368 @@ define inreg <40 x half> @bitcast_v10f64_to_v40f16_scalar(<10 x double> inreg %a ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; SI-NEXT: v_readfirstlane_b32 s8, v1 -; SI-NEXT: v_readfirstlane_b32 s9, v2 -; SI-NEXT: v_readfirstlane_b32 s6, v3 -; SI-NEXT: v_readfirstlane_b32 s7, v4 -; SI-NEXT: v_readfirstlane_b32 s4, v5 -; SI-NEXT: s_and_b64 s[10:11], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s5, v6 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v20, s16 +; SI-NEXT: v_mov_b32_e32 v21, s17 +; SI-NEXT: v_mov_b32_e32 v18, s18 +; SI-NEXT: v_mov_b32_e32 v19, s19 +; SI-NEXT: v_mov_b32_e32 v16, s20 +; SI-NEXT: v_mov_b32_e32 v17, s21 +; SI-NEXT: v_mov_b32_e32 v14, s22 +; SI-NEXT: v_mov_b32_e32 v15, s23 +; SI-NEXT: v_mov_b32_e32 v12, s24 +; SI-NEXT: v_mov_b32_e32 v13, s25 +; SI-NEXT: v_mov_b32_e32 v8, s26 +; SI-NEXT: v_mov_b32_e32 v9, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v10, s28 +; SI-NEXT: v_mov_b32_e32 v11, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s10, s5, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s10 -; SI-NEXT: s_lshr_b32 s10, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s10 -; SI-NEXT: s_lshr_b32 s10, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s10 -; SI-NEXT: s_lshr_b32 s10, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s10 -; SI-NEXT: s_lshr_b32 s10, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s10 -; SI-NEXT: s_lshr_b32 s10, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s10 -; SI-NEXT: s_lshr_b32 s10, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 -; SI-NEXT: s_lshr_b32 s10, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s10 -; SI-NEXT: s_lshr_b32 s10, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s10 -; SI-NEXT: s_lshr_b32 s10, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s10 -; SI-NEXT: s_lshr_b32 s10, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s10 -; SI-NEXT: s_lshr_b32 s10, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s10 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s10 -; SI-NEXT: s_lshr_b32 s10, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s10 -; SI-NEXT: s_lshr_b32 s10, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s10 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s10 -; SI-NEXT: s_lshr_b32 s10, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s10 -; SI-NEXT: s_lshr_b32 s10, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s10 -; SI-NEXT: s_lshr_b32 s10, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s10 -; SI-NEXT: s_lshr_b32 s10, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s16 -; SI-NEXT: s_cbranch_execnz .LBB53_3 -; SI-NEXT: .LBB53_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[1:2], s[16:17], 1.0 -; SI-NEXT: v_add_f64 v[34:35], s[18:19], 1.0 -; SI-NEXT: v_add_f64 v[29:30], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[25:26], s[22:23], 1.0 -; SI-NEXT: v_add_f64 v[21:22], s[24:25], 1.0 -; SI-NEXT: v_add_f64 v[19:20], s[26:27], 1.0 -; SI-NEXT: v_add_f64 v[14:15], s[28:29], 1.0 -; SI-NEXT: v_add_f64 v[10:11], s[8:9], 1.0 -; SI-NEXT: v_add_f64 v[6:7], s[6:7], 1.0 -; SI-NEXT: v_add_f64 v[4:5], s[4:5], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v19 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v46, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v56, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v11 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v60, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v20 +; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 ; SI-NEXT: .LBB53_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_or_b32_e32 v39, v39, v48 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v57 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v39, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v37, v37, v38 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v37, vcc, 8, v0 -; SI-NEXT: v_or_b32_e32 v35, v35, v36 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v35, vcc, 12, v0 -; SI-NEXT: v_or_b32_e32 v33, v33, v34 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v33, vcc, 16, v0 -; SI-NEXT: v_or_b32_e32 v31, v32, v31 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v31, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v29, v30, v29 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v29, vcc, 24, v0 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v27, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v25, vcc, 32, v0 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v23, vcc, 36, v0 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v21, vcc, 40, v0 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v19, vcc, 44, v0 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 -; SI-NEXT: v_add_i32_e32 v15, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_branch .LBB53_2 ; ; VI-LABEL: bitcast_v10f64_to_v40f16_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll index ed407c1e20c14..9cff9c4a9dc65 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll @@ -3021,7 +3021,35 @@ define inreg <44 x i16> @bitcast_v22i32_to_v44i16_scalar(<22 x i32> inreg %a, i3 ; SI-LABEL: bitcast_v22i32_to_v44i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v10, s16 +; SI-NEXT: v_mov_b32_e32 v11, s17 +; SI-NEXT: v_mov_b32_e32 v12, s18 +; SI-NEXT: v_mov_b32_e32 v13, s19 +; SI-NEXT: v_mov_b32_e32 v14, s20 +; SI-NEXT: v_mov_b32_e32 v15, s21 +; SI-NEXT: v_mov_b32_e32 v16, s22 +; SI-NEXT: v_mov_b32_e32 v17, s23 +; SI-NEXT: v_mov_b32_e32 v18, s24 +; SI-NEXT: v_mov_b32_e32 v19, s25 +; SI-NEXT: v_readfirstlane_b32 s24, v10 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_readfirstlane_b32 s25, v11 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_readfirstlane_b32 s22, v12 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_readfirstlane_b32 s23, v13 +; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; SI-NEXT: v_readfirstlane_b32 s20, v14 +; SI-NEXT: v_readfirstlane_b32 s21, v15 +; SI-NEXT: v_readfirstlane_b32 s18, v16 +; SI-NEXT: v_readfirstlane_b32 s19, v17 +; SI-NEXT: v_readfirstlane_b32 s16, v18 +; SI-NEXT: v_readfirstlane_b32 s17, v19 +; SI-NEXT: v_readfirstlane_b32 s14, v10 +; SI-NEXT: v_readfirstlane_b32 s15, v11 +; SI-NEXT: v_readfirstlane_b32 s12, v12 +; SI-NEXT: v_readfirstlane_b32 s13, v13 ; SI-NEXT: v_readfirstlane_b32 s10, v1 ; SI-NEXT: v_readfirstlane_b32 s11, v2 ; SI-NEXT: v_readfirstlane_b32 s8, v3 @@ -3029,7 +3057,7 @@ define inreg <44 x i16> @bitcast_v22i32_to_v44i16_scalar(<22 x i32> inreg %a, i3 ; SI-NEXT: v_readfirstlane_b32 s6, v5 ; SI-NEXT: v_readfirstlane_b32 s7, v6 ; SI-NEXT: v_readfirstlane_b32 s4, v7 -; SI-NEXT: s_and_b64 s[12:13], vcc, exec +; SI-NEXT: s_and_b64 s[26:27], vcc, exec ; SI-NEXT: v_readfirstlane_b32 s5, v8 ; SI-NEXT: s_cbranch_scc0 .LBB13_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -3037,40 +3065,40 @@ define inreg <44 x i16> @bitcast_v22i32_to_v44i16_scalar(<22 x i32> inreg %a, i3 ; SI-NEXT: s_lshr_b32 s77, s7, 16 ; SI-NEXT: s_lshr_b32 s78, s9, 16 ; SI-NEXT: s_lshr_b32 s79, s11, 16 -; SI-NEXT: s_lshr_b32 s88, s29, 16 -; SI-NEXT: s_lshr_b32 s89, s27, 16 -; SI-NEXT: s_lshr_b32 s90, s25, 16 -; SI-NEXT: s_lshr_b32 s91, s23, 16 +; SI-NEXT: s_lshr_b32 s88, s13, 16 +; SI-NEXT: s_lshr_b32 s89, s15, 16 +; SI-NEXT: s_lshr_b32 s90, s17, 16 +; SI-NEXT: s_lshr_b32 s91, s19, 16 ; SI-NEXT: s_lshr_b32 s92, s21, 16 -; SI-NEXT: s_lshr_b32 s93, s19, 16 -; SI-NEXT: s_lshr_b32 s94, s17, 16 -; SI-NEXT: s_lshr_b64 s[12:13], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[14:15], s[6:7], 16 +; SI-NEXT: s_lshr_b32 s93, s23, 16 +; SI-NEXT: s_lshr_b32 s94, s25, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 ; SI-NEXT: s_lshr_b64 s[40:41], s[8:9], 16 ; SI-NEXT: s_lshr_b64 s[42:43], s[10:11], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[56:57], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[58:59], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[18:19], 16 ; SI-NEXT: s_lshr_b64 s[60:61], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[62:63], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[72:73], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[24:25], 16 ; SI-NEXT: s_cbranch_execnz .LBB13_3 ; SI-NEXT: .LBB13_2: ; %cmp.true -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_add_i32 s10, s10, 3 ; SI-NEXT: s_add_i32 s9, s9, 3 @@ -3079,125 +3107,125 @@ define inreg <44 x i16> @bitcast_v22i32_to_v44i16_scalar(<22 x i32> inreg %a, i3 ; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s5, s5, 3 ; SI-NEXT: s_add_i32 s4, s4, 3 -; SI-NEXT: s_lshr_b64 s[12:13], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[14:15], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 ; SI-NEXT: s_lshr_b64 s[40:41], s[8:9], 16 ; SI-NEXT: s_lshr_b64 s[42:43], s[10:11], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 16 ; SI-NEXT: s_lshr_b32 s76, s5, 16 ; SI-NEXT: s_lshr_b32 s77, s7, 16 ; SI-NEXT: s_lshr_b32 s78, s9, 16 ; SI-NEXT: s_lshr_b32 s79, s11, 16 -; SI-NEXT: s_lshr_b32 s88, s29, 16 -; SI-NEXT: s_lshr_b32 s89, s27, 16 -; SI-NEXT: s_lshr_b32 s90, s25, 16 -; SI-NEXT: s_lshr_b32 s91, s23, 16 +; SI-NEXT: s_lshr_b32 s88, s13, 16 +; SI-NEXT: s_lshr_b32 s89, s15, 16 +; SI-NEXT: s_lshr_b32 s90, s17, 16 +; SI-NEXT: s_lshr_b32 s91, s19, 16 ; SI-NEXT: s_lshr_b32 s92, s21, 16 -; SI-NEXT: s_lshr_b32 s93, s19, 16 -; SI-NEXT: s_lshr_b32 s94, s17, 16 -; SI-NEXT: s_lshr_b64 s[56:57], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[58:59], s[22:23], 16 +; SI-NEXT: s_lshr_b32 s93, s23, 16 +; SI-NEXT: s_lshr_b32 s94, s25, 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[18:19], 16 ; SI-NEXT: s_lshr_b64 s[60:61], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[62:63], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[72:73], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[24:25], 16 ; SI-NEXT: .LBB13_3: ; %end -; SI-NEXT: s_lshl_b32 s13, s72, 16 -; SI-NEXT: s_and_b32 s15, s16, 0xffff -; SI-NEXT: s_or_b32 s13, s15, s13 -; SI-NEXT: v_mov_b32_e32 v1, s13 -; SI-NEXT: s_and_b32 s13, s17, 0xffff -; SI-NEXT: s_lshl_b32 s15, s94, 16 -; SI-NEXT: s_or_b32 s13, s13, s15 -; SI-NEXT: v_mov_b32_e32 v2, s13 -; SI-NEXT: s_lshl_b32 s13, s62, 16 -; SI-NEXT: s_and_b32 s15, s18, 0xffff -; SI-NEXT: s_or_b32 s13, s15, s13 -; SI-NEXT: v_mov_b32_e32 v3, s13 -; SI-NEXT: s_and_b32 s13, s19, 0xffff -; SI-NEXT: s_lshl_b32 s15, s93, 16 -; SI-NEXT: s_or_b32 s13, s13, s15 -; SI-NEXT: v_mov_b32_e32 v4, s13 -; SI-NEXT: s_lshl_b32 s13, s60, 16 -; SI-NEXT: s_and_b32 s15, s20, 0xffff +; SI-NEXT: s_lshl_b32 s27, s72, 16 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_or_b32 s24, s24, s27 +; SI-NEXT: v_mov_b32_e32 v1, s24 +; SI-NEXT: s_and_b32 s24, s25, 0xffff +; SI-NEXT: s_lshl_b32 s25, s94, 16 +; SI-NEXT: s_or_b32 s24, s24, s25 +; SI-NEXT: v_mov_b32_e32 v2, s24 +; SI-NEXT: s_lshl_b32 s24, s62, 16 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_or_b32 s22, s22, s24 +; SI-NEXT: v_mov_b32_e32 v3, s22 +; SI-NEXT: s_and_b32 s22, s23, 0xffff +; SI-NEXT: s_lshl_b32 s23, s93, 16 +; SI-NEXT: s_or_b32 s22, s22, s23 +; SI-NEXT: v_mov_b32_e32 v4, s22 +; SI-NEXT: s_lshl_b32 s22, s60, 16 +; SI-NEXT: s_and_b32 s20, s20, 0xffff ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: s_or_b32 s13, s15, s13 +; SI-NEXT: s_or_b32 s20, s20, s22 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v2, s13 -; SI-NEXT: s_and_b32 s13, s21, 0xffff -; SI-NEXT: s_lshl_b32 s15, s92, 16 +; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_and_b32 s20, s21, 0xffff +; SI-NEXT: s_lshl_b32 s21, s92, 16 ; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_or_b32 s13, s13, s15 +; SI-NEXT: s_or_b32 s20, s20, s21 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s13 -; SI-NEXT: s_and_b32 s13, s22, 0xffff -; SI-NEXT: s_lshl_b32 s15, s58, 16 +; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s20, s58, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_or_b32 s13, s13, s15 +; SI-NEXT: s_or_b32 s18, s18, s20 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s13 -; SI-NEXT: s_and_b32 s13, s23, 0xffff -; SI-NEXT: s_lshl_b32 s15, s91, 16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_and_b32 s18, s19, 0xffff +; SI-NEXT: s_lshl_b32 s19, s91, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_or_b32 s13, s13, s15 +; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s13 -; SI-NEXT: s_and_b32 s13, s24, 0xffff -; SI-NEXT: s_lshl_b32 s15, s56, 16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s18, s56, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s13, s13, s15 +; SI-NEXT: s_or_b32 s16, s16, s18 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s13 -; SI-NEXT: s_and_b32 s13, s25, 0xffff -; SI-NEXT: s_lshl_b32 s15, s90, 16 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s17, 0xffff +; SI-NEXT: s_lshl_b32 s17, s90, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s13, s13, s15 +; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s13 -; SI-NEXT: s_and_b32 s13, s26, 0xffff -; SI-NEXT: s_lshl_b32 s15, s46, 16 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s16, s46, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: s_or_b32 s13, s13, s15 +; SI-NEXT: s_or_b32 s14, s14, s16 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s13 -; SI-NEXT: s_and_b32 s13, s27, 0xffff +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_and_b32 s14, s15, 0xffff ; SI-NEXT: s_lshl_b32 s15, s89, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 -; SI-NEXT: s_or_b32 s13, s13, s15 +; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s13 -; SI-NEXT: s_and_b32 s13, s28, 0xffff -; SI-NEXT: s_lshl_b32 s15, s44, 16 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s14, s44, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s13, s13, s15 +; SI-NEXT: s_or_b32 s12, s12, s14 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s13 -; SI-NEXT: s_and_b32 s13, s29, 0xffff -; SI-NEXT: s_lshl_b32 s15, s88, 16 +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s12, s13, 0xffff +; SI-NEXT: s_lshl_b32 s13, s88, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_or_b32 s13, s13, s15 +; SI-NEXT: s_or_b32 s12, s12, s13 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s13 +; SI-NEXT: v_mov_b32_e32 v2, s12 ; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: s_lshl_b32 s13, s42, 16 +; SI-NEXT: s_lshl_b32 s12, s42, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 -; SI-NEXT: s_or_b32 s10, s10, s13 +; SI-NEXT: s_or_b32 s10, s10, s12 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s10 @@ -3223,7 +3251,7 @@ define inreg <44 x i16> @bitcast_v22i32_to_v44i16_scalar(<22 x i32> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_lshl_b32 s8, s14, 16 +; SI-NEXT: s_lshl_b32 s8, s28, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 ; SI-NEXT: s_or_b32 s6, s6, s8 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -3237,7 +3265,7 @@ define inreg <44 x i16> @bitcast_v22i32_to_v44i16_scalar(<22 x i32> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s6, s12, 16 +; SI-NEXT: s_lshl_b32 s6, s26, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x4c, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -3272,16 +3300,44 @@ define inreg <44 x i16> @bitcast_v22i32_to_v44i16_scalar(<22 x i32> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr79 ; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: ; implicit-def: $sgpr78 -; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr77 -; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr76 ; SI-NEXT: s_branch .LBB13_2 ; ; VI-LABEL: bitcast_v22i32_to_v44i16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, s16 +; VI-NEXT: v_mov_b32_e32 v10, s17 +; VI-NEXT: v_mov_b32_e32 v11, s18 +; VI-NEXT: v_mov_b32_e32 v12, s19 +; VI-NEXT: v_mov_b32_e32 v13, s20 +; VI-NEXT: v_mov_b32_e32 v14, s21 +; VI-NEXT: v_mov_b32_e32 v15, s22 +; VI-NEXT: v_mov_b32_e32 v16, s23 +; VI-NEXT: v_mov_b32_e32 v17, s24 +; VI-NEXT: v_mov_b32_e32 v18, s25 +; VI-NEXT: v_mov_b32_e32 v19, s26 +; VI-NEXT: v_readfirstlane_b32 s40, v9 +; VI-NEXT: v_mov_b32_e32 v9, s27 +; VI-NEXT: v_readfirstlane_b32 s26, v10 +; VI-NEXT: v_mov_b32_e32 v10, s28 +; VI-NEXT: v_readfirstlane_b32 s25, v11 +; VI-NEXT: v_mov_b32_e32 v11, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_readfirstlane_b32 s24, v12 +; VI-NEXT: v_readfirstlane_b32 s23, v13 +; VI-NEXT: v_readfirstlane_b32 s22, v14 +; VI-NEXT: v_readfirstlane_b32 s21, v15 +; VI-NEXT: v_readfirstlane_b32 s20, v16 +; VI-NEXT: v_readfirstlane_b32 s19, v17 +; VI-NEXT: v_readfirstlane_b32 s18, v18 +; VI-NEXT: v_readfirstlane_b32 s17, v19 +; VI-NEXT: v_readfirstlane_b32 s16, v9 +; VI-NEXT: v_readfirstlane_b32 s15, v10 +; VI-NEXT: v_readfirstlane_b32 s14, v11 ; VI-NEXT: v_readfirstlane_b32 s13, v0 ; VI-NEXT: v_readfirstlane_b32 s12, v1 ; VI-NEXT: v_readfirstlane_b32 s11, v2 @@ -3293,28 +3349,28 @@ define inreg <44 x i16> @bitcast_v22i32_to_v44i16_scalar(<22 x i32> inreg %a, i3 ; VI-NEXT: v_readfirstlane_b32 s7, v7 ; VI-NEXT: s_cbranch_scc0 .LBB13_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s14, s7, 16 -; VI-NEXT: s_lshr_b32 s15, s6, 16 -; VI-NEXT: s_lshr_b32 s40, s8, 16 +; VI-NEXT: s_lshr_b32 s27, s7, 16 +; VI-NEXT: s_lshr_b32 s28, s6, 16 +; VI-NEXT: s_lshr_b32 s29, s8, 16 ; VI-NEXT: s_lshr_b32 s41, s9, 16 ; VI-NEXT: s_lshr_b32 s42, s10, 16 ; VI-NEXT: s_lshr_b32 s43, s11, 16 ; VI-NEXT: s_lshr_b32 s44, s12, 16 ; VI-NEXT: s_lshr_b32 s45, s13, 16 -; VI-NEXT: s_lshr_b32 s46, s29, 16 -; VI-NEXT: s_lshr_b32 s47, s28, 16 -; VI-NEXT: s_lshr_b32 s56, s27, 16 -; VI-NEXT: s_lshr_b32 s57, s26, 16 -; VI-NEXT: s_lshr_b32 s58, s25, 16 -; VI-NEXT: s_lshr_b32 s59, s24, 16 -; VI-NEXT: s_lshr_b32 s60, s23, 16 -; VI-NEXT: s_lshr_b32 s61, s22, 16 -; VI-NEXT: s_lshr_b32 s62, s21, 16 -; VI-NEXT: s_lshr_b32 s63, s20, 16 -; VI-NEXT: s_lshr_b32 s72, s19, 16 -; VI-NEXT: s_lshr_b32 s73, s18, 16 -; VI-NEXT: s_lshr_b32 s74, s17, 16 -; VI-NEXT: s_lshr_b32 s75, s16, 16 +; VI-NEXT: s_lshr_b32 s46, s14, 16 +; VI-NEXT: s_lshr_b32 s47, s15, 16 +; VI-NEXT: s_lshr_b32 s56, s16, 16 +; VI-NEXT: s_lshr_b32 s57, s17, 16 +; VI-NEXT: s_lshr_b32 s58, s18, 16 +; VI-NEXT: s_lshr_b32 s59, s19, 16 +; VI-NEXT: s_lshr_b32 s60, s20, 16 +; VI-NEXT: s_lshr_b32 s61, s21, 16 +; VI-NEXT: s_lshr_b32 s62, s22, 16 +; VI-NEXT: s_lshr_b32 s63, s23, 16 +; VI-NEXT: s_lshr_b32 s72, s24, 16 +; VI-NEXT: s_lshr_b32 s73, s25, 16 +; VI-NEXT: s_lshr_b32 s74, s26, 16 +; VI-NEXT: s_lshr_b32 s75, s40, 16 ; VI-NEXT: s_cbranch_execnz .LBB13_3 ; VI-NEXT: .LBB13_2: ; %cmp.true ; VI-NEXT: s_add_i32 s7, s7, 3 @@ -3325,123 +3381,123 @@ define inreg <44 x i16> @bitcast_v22i32_to_v44i16_scalar(<22 x i32> inreg %a, i3 ; VI-NEXT: s_add_i32 s11, s11, 3 ; VI-NEXT: s_add_i32 s12, s12, 3 ; VI-NEXT: s_add_i32 s13, s13, 3 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_lshr_b32 s14, s7, 16 -; VI-NEXT: s_lshr_b32 s15, s6, 16 -; VI-NEXT: s_lshr_b32 s40, s8, 16 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_lshr_b32 s27, s7, 16 +; VI-NEXT: s_lshr_b32 s28, s6, 16 +; VI-NEXT: s_lshr_b32 s29, s8, 16 ; VI-NEXT: s_lshr_b32 s41, s9, 16 ; VI-NEXT: s_lshr_b32 s42, s10, 16 ; VI-NEXT: s_lshr_b32 s43, s11, 16 ; VI-NEXT: s_lshr_b32 s44, s12, 16 ; VI-NEXT: s_lshr_b32 s45, s13, 16 -; VI-NEXT: s_lshr_b32 s46, s29, 16 -; VI-NEXT: s_lshr_b32 s47, s28, 16 -; VI-NEXT: s_lshr_b32 s56, s27, 16 -; VI-NEXT: s_lshr_b32 s57, s26, 16 -; VI-NEXT: s_lshr_b32 s58, s25, 16 -; VI-NEXT: s_lshr_b32 s59, s24, 16 -; VI-NEXT: s_lshr_b32 s60, s23, 16 -; VI-NEXT: s_lshr_b32 s61, s22, 16 -; VI-NEXT: s_lshr_b32 s62, s21, 16 -; VI-NEXT: s_lshr_b32 s63, s20, 16 -; VI-NEXT: s_lshr_b32 s72, s19, 16 -; VI-NEXT: s_lshr_b32 s73, s18, 16 -; VI-NEXT: s_lshr_b32 s74, s17, 16 -; VI-NEXT: s_lshr_b32 s75, s16, 16 +; VI-NEXT: s_lshr_b32 s46, s14, 16 +; VI-NEXT: s_lshr_b32 s47, s15, 16 +; VI-NEXT: s_lshr_b32 s56, s16, 16 +; VI-NEXT: s_lshr_b32 s57, s17, 16 +; VI-NEXT: s_lshr_b32 s58, s18, 16 +; VI-NEXT: s_lshr_b32 s59, s19, 16 +; VI-NEXT: s_lshr_b32 s60, s20, 16 +; VI-NEXT: s_lshr_b32 s61, s21, 16 +; VI-NEXT: s_lshr_b32 s62, s22, 16 +; VI-NEXT: s_lshr_b32 s63, s23, 16 +; VI-NEXT: s_lshr_b32 s72, s24, 16 +; VI-NEXT: s_lshr_b32 s73, s25, 16 +; VI-NEXT: s_lshr_b32 s74, s26, 16 +; VI-NEXT: s_lshr_b32 s75, s40, 16 ; VI-NEXT: .LBB13_3: ; %end -; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_and_b32 s4, 0xffff, s40 ; VI-NEXT: s_lshl_b32 s5, s75, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s16, s74, 16 -; VI-NEXT: s_or_b32 s5, s5, s16 -; VI-NEXT: s_and_b32 s16, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s17, s73, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_and_b32 s17, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s18, s72, 16 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s18, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s19, s63, 16 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_and_b32 s19, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s20, s62, 16 -; VI-NEXT: s_or_b32 s19, s19, s20 -; VI-NEXT: s_and_b32 s20, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s21, s61, 16 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_and_b32 s21, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s22, s60, 16 -; VI-NEXT: s_or_b32 s21, s21, s22 -; VI-NEXT: s_and_b32 s22, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s23, s59, 16 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_and_b32 s23, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s24, s58, 16 -; VI-NEXT: s_or_b32 s23, s23, s24 -; VI-NEXT: s_and_b32 s24, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s25, s57, 16 -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_and_b32 s25, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s26, s56, 16 +; VI-NEXT: s_and_b32 s5, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s26, s74, 16 +; VI-NEXT: s_or_b32 s5, s5, s26 +; VI-NEXT: s_and_b32 s25, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s26, s73, 16 ; VI-NEXT: s_or_b32 s25, s25, s26 -; VI-NEXT: s_and_b32 s26, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s27, s47, 16 -; VI-NEXT: s_or_b32 s26, s26, s27 -; VI-NEXT: s_and_b32 s27, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s28, s46, 16 -; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s24, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s26, s72, 16 +; VI-NEXT: s_or_b32 s24, s24, s26 +; VI-NEXT: s_and_b32 s23, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s26, s63, 16 +; VI-NEXT: s_or_b32 s23, s23, s26 +; VI-NEXT: s_and_b32 s22, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s26, s62, 16 +; VI-NEXT: s_or_b32 s22, s22, s26 +; VI-NEXT: s_and_b32 s21, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s26, s61, 16 +; VI-NEXT: s_or_b32 s21, s21, s26 +; VI-NEXT: s_and_b32 s20, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s26, s60, 16 +; VI-NEXT: s_or_b32 s20, s20, s26 +; VI-NEXT: s_and_b32 s19, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s26, s59, 16 +; VI-NEXT: s_or_b32 s19, s19, s26 +; VI-NEXT: s_and_b32 s18, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s26, s58, 16 +; VI-NEXT: s_or_b32 s18, s18, s26 +; VI-NEXT: s_and_b32 s17, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s26, s57, 16 +; VI-NEXT: s_or_b32 s17, s17, s26 +; VI-NEXT: s_and_b32 s16, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s26, s56, 16 +; VI-NEXT: s_or_b32 s16, s16, s26 +; VI-NEXT: s_and_b32 s15, 0xffff, s15 +; VI-NEXT: s_lshl_b32 s26, s47, 16 +; VI-NEXT: s_or_b32 s15, s15, s26 +; VI-NEXT: s_and_b32 s14, 0xffff, s14 +; VI-NEXT: s_lshl_b32 s26, s46, 16 +; VI-NEXT: s_or_b32 s14, s14, s26 ; VI-NEXT: s_and_b32 s13, 0xffff, s13 -; VI-NEXT: s_lshl_b32 s28, s45, 16 -; VI-NEXT: s_or_b32 s13, s13, s28 +; VI-NEXT: s_lshl_b32 s26, s45, 16 +; VI-NEXT: s_or_b32 s13, s13, s26 ; VI-NEXT: s_and_b32 s12, 0xffff, s12 -; VI-NEXT: s_lshl_b32 s28, s44, 16 -; VI-NEXT: s_or_b32 s12, s12, s28 +; VI-NEXT: s_lshl_b32 s26, s44, 16 +; VI-NEXT: s_or_b32 s12, s12, s26 ; VI-NEXT: s_and_b32 s11, 0xffff, s11 -; VI-NEXT: s_lshl_b32 s28, s43, 16 -; VI-NEXT: s_or_b32 s11, s11, s28 +; VI-NEXT: s_lshl_b32 s26, s43, 16 +; VI-NEXT: s_or_b32 s11, s11, s26 ; VI-NEXT: s_and_b32 s10, 0xffff, s10 -; VI-NEXT: s_lshl_b32 s28, s42, 16 -; VI-NEXT: s_or_b32 s10, s10, s28 +; VI-NEXT: s_lshl_b32 s26, s42, 16 +; VI-NEXT: s_or_b32 s10, s10, s26 ; VI-NEXT: s_and_b32 s9, 0xffff, s9 -; VI-NEXT: s_lshl_b32 s28, s41, 16 -; VI-NEXT: s_or_b32 s9, s9, s28 +; VI-NEXT: s_lshl_b32 s26, s41, 16 +; VI-NEXT: s_or_b32 s9, s9, s26 ; VI-NEXT: s_and_b32 s8, 0xffff, s8 -; VI-NEXT: s_lshl_b32 s28, s40, 16 +; VI-NEXT: s_lshl_b32 s26, s29, 16 +; VI-NEXT: s_or_b32 s8, s8, s26 ; VI-NEXT: s_and_b32 s6, 0xffff, s6 -; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_lshl_b32 s26, s28, 16 +; VI-NEXT: s_or_b32 s6, s6, s26 ; VI-NEXT: s_and_b32 s7, 0xffff, s7 -; VI-NEXT: s_lshl_b32 s14, s14, 16 -; VI-NEXT: s_or_b32 s8, s8, s28 -; VI-NEXT: s_or_b32 s6, s6, s15 -; VI-NEXT: s_or_b32 s7, s7, s14 +; VI-NEXT: s_lshl_b32 s26, s27, 16 +; VI-NEXT: s_or_b32 s7, s7, s26 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: v_mov_b32_e32 v3, s17 -; VI-NEXT: v_mov_b32_e32 v4, s18 -; VI-NEXT: v_mov_b32_e32 v5, s19 -; VI-NEXT: v_mov_b32_e32 v6, s20 -; VI-NEXT: v_mov_b32_e32 v7, s21 -; VI-NEXT: v_mov_b32_e32 v8, s22 -; VI-NEXT: v_mov_b32_e32 v9, s23 -; VI-NEXT: v_mov_b32_e32 v10, s24 -; VI-NEXT: v_mov_b32_e32 v11, s25 -; VI-NEXT: v_mov_b32_e32 v12, s26 -; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v2, s25 +; VI-NEXT: v_mov_b32_e32 v3, s24 +; VI-NEXT: v_mov_b32_e32 v4, s23 +; VI-NEXT: v_mov_b32_e32 v5, s22 +; VI-NEXT: v_mov_b32_e32 v6, s21 +; VI-NEXT: v_mov_b32_e32 v7, s20 +; VI-NEXT: v_mov_b32_e32 v8, s19 +; VI-NEXT: v_mov_b32_e32 v9, s18 +; VI-NEXT: v_mov_b32_e32 v10, s17 +; VI-NEXT: v_mov_b32_e32 v11, s16 +; VI-NEXT: v_mov_b32_e32 v12, s15 +; VI-NEXT: v_mov_b32_e32 v13, s14 ; VI-NEXT: v_mov_b32_e32 v14, s13 ; VI-NEXT: v_mov_b32_e32 v15, s12 ; VI-NEXT: v_mov_b32_e32 v16, s11 @@ -3471,60 +3527,78 @@ define inreg <44 x i16> @bitcast_v22i32_to_v44i16_scalar(<22 x i32> inreg %a, i3 ; VI-NEXT: ; implicit-def: $sgpr43 ; VI-NEXT: ; implicit-def: $sgpr42 ; VI-NEXT: ; implicit-def: $sgpr41 -; VI-NEXT: ; implicit-def: $sgpr40 -; VI-NEXT: ; implicit-def: $sgpr15 -; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 ; VI-NEXT: s_branch .LBB13_2 ; ; GFX9-LABEL: bitcast_v22i32_to_v44i16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, s16 +; GFX9-NEXT: v_mov_b32_e32 v10, s17 +; GFX9-NEXT: v_mov_b32_e32 v11, s18 +; GFX9-NEXT: v_mov_b32_e32 v12, s19 +; GFX9-NEXT: v_mov_b32_e32 v13, s20 +; GFX9-NEXT: v_mov_b32_e32 v14, s21 +; GFX9-NEXT: v_mov_b32_e32 v15, s22 +; GFX9-NEXT: v_mov_b32_e32 v16, s23 +; GFX9-NEXT: v_mov_b32_e32 v17, s24 +; GFX9-NEXT: v_mov_b32_e32 v18, s25 +; GFX9-NEXT: v_mov_b32_e32 v19, s26 +; GFX9-NEXT: v_readfirstlane_b32 s6, v9 +; GFX9-NEXT: v_mov_b32_e32 v9, s27 +; GFX9-NEXT: v_readfirstlane_b32 s7, v10 +; GFX9-NEXT: v_mov_b32_e32 v10, s28 +; GFX9-NEXT: v_readfirstlane_b32 s8, v11 +; GFX9-NEXT: v_mov_b32_e32 v11, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: v_readfirstlane_b32 s7, v1 -; GFX9-NEXT: v_readfirstlane_b32 s8, v2 -; GFX9-NEXT: v_readfirstlane_b32 s9, v3 -; GFX9-NEXT: v_readfirstlane_b32 s10, v4 -; GFX9-NEXT: v_readfirstlane_b32 s11, v5 -; GFX9-NEXT: v_readfirstlane_b32 s12, v6 +; GFX9-NEXT: v_readfirstlane_b32 s9, v12 +; GFX9-NEXT: v_readfirstlane_b32 s10, v13 +; GFX9-NEXT: v_readfirstlane_b32 s11, v14 +; GFX9-NEXT: v_readfirstlane_b32 s12, v15 +; GFX9-NEXT: v_readfirstlane_b32 s13, v16 +; GFX9-NEXT: v_readfirstlane_b32 s14, v17 +; GFX9-NEXT: v_readfirstlane_b32 s15, v18 +; GFX9-NEXT: v_readfirstlane_b32 s16, v19 +; GFX9-NEXT: v_readfirstlane_b32 s17, v9 +; GFX9-NEXT: v_readfirstlane_b32 s18, v10 +; GFX9-NEXT: v_readfirstlane_b32 s19, v11 +; GFX9-NEXT: v_readfirstlane_b32 s20, v0 +; GFX9-NEXT: v_readfirstlane_b32 s21, v1 +; GFX9-NEXT: v_readfirstlane_b32 s22, v2 +; GFX9-NEXT: v_readfirstlane_b32 s23, v3 +; GFX9-NEXT: v_readfirstlane_b32 s24, v4 +; GFX9-NEXT: v_readfirstlane_b32 s25, v5 +; GFX9-NEXT: v_readfirstlane_b32 s26, v6 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_readfirstlane_b32 s13, v7 +; GFX9-NEXT: v_readfirstlane_b32 s27, v7 ; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_lshr_b32 s14, s13, 16 -; GFX9-NEXT: s_lshr_b32 s15, s12, 16 -; GFX9-NEXT: s_lshr_b32 s40, s11, 16 -; GFX9-NEXT: s_lshr_b32 s41, s10, 16 -; GFX9-NEXT: s_lshr_b32 s42, s9, 16 -; GFX9-NEXT: s_lshr_b32 s43, s8, 16 -; GFX9-NEXT: s_lshr_b32 s44, s7, 16 -; GFX9-NEXT: s_lshr_b32 s45, s6, 16 -; GFX9-NEXT: s_lshr_b32 s46, s29, 16 -; GFX9-NEXT: s_lshr_b32 s47, s28, 16 -; GFX9-NEXT: s_lshr_b32 s56, s27, 16 -; GFX9-NEXT: s_lshr_b32 s57, s26, 16 -; GFX9-NEXT: s_lshr_b32 s58, s25, 16 -; GFX9-NEXT: s_lshr_b32 s59, s24, 16 -; GFX9-NEXT: s_lshr_b32 s60, s23, 16 -; GFX9-NEXT: s_lshr_b32 s61, s22, 16 -; GFX9-NEXT: s_lshr_b32 s62, s21, 16 -; GFX9-NEXT: s_lshr_b32 s63, s20, 16 -; GFX9-NEXT: s_lshr_b32 s72, s19, 16 -; GFX9-NEXT: s_lshr_b32 s73, s18, 16 -; GFX9-NEXT: s_lshr_b32 s74, s17, 16 -; GFX9-NEXT: s_lshr_b32 s75, s16, 16 +; GFX9-NEXT: s_lshr_b32 s28, s27, 16 +; GFX9-NEXT: s_lshr_b32 s29, s26, 16 +; GFX9-NEXT: s_lshr_b32 s40, s25, 16 +; GFX9-NEXT: s_lshr_b32 s41, s24, 16 +; GFX9-NEXT: s_lshr_b32 s42, s23, 16 +; GFX9-NEXT: s_lshr_b32 s43, s22, 16 +; GFX9-NEXT: s_lshr_b32 s44, s21, 16 +; GFX9-NEXT: s_lshr_b32 s45, s20, 16 +; GFX9-NEXT: s_lshr_b32 s46, s19, 16 +; GFX9-NEXT: s_lshr_b32 s47, s18, 16 +; GFX9-NEXT: s_lshr_b32 s56, s17, 16 +; GFX9-NEXT: s_lshr_b32 s57, s16, 16 +; GFX9-NEXT: s_lshr_b32 s58, s15, 16 +; GFX9-NEXT: s_lshr_b32 s59, s14, 16 +; GFX9-NEXT: s_lshr_b32 s60, s13, 16 +; GFX9-NEXT: s_lshr_b32 s61, s12, 16 +; GFX9-NEXT: s_lshr_b32 s62, s11, 16 +; GFX9-NEXT: s_lshr_b32 s63, s10, 16 +; GFX9-NEXT: s_lshr_b32 s72, s9, 16 +; GFX9-NEXT: s_lshr_b32 s73, s8, 16 +; GFX9-NEXT: s_lshr_b32 s74, s7, 16 +; GFX9-NEXT: s_lshr_b32 s75, s6, 16 ; GFX9-NEXT: s_cbranch_execnz .LBB13_3 ; GFX9-NEXT: .LBB13_2: ; %cmp.true -; GFX9-NEXT: s_add_i32 s13, s13, 3 -; GFX9-NEXT: s_add_i32 s12, s12, 3 -; GFX9-NEXT: s_add_i32 s11, s11, 3 -; GFX9-NEXT: s_add_i32 s10, s10, 3 -; GFX9-NEXT: s_add_i32 s9, s9, 3 -; GFX9-NEXT: s_add_i32 s8, s8, 3 -; GFX9-NEXT: s_add_i32 s7, s7, 3 -; GFX9-NEXT: s_add_i32 s6, s6, 3 -; GFX9-NEXT: s_add_i32 s29, s29, 3 -; GFX9-NEXT: s_add_i32 s28, s28, 3 ; GFX9-NEXT: s_add_i32 s27, s27, 3 ; GFX9-NEXT: s_add_i32 s26, s26, 3 ; GFX9-NEXT: s_add_i32 s25, s25, 3 @@ -3537,73 +3611,83 @@ define inreg <44 x i16> @bitcast_v22i32_to_v44i16_scalar(<22 x i32> inreg %a, i3 ; GFX9-NEXT: s_add_i32 s18, s18, 3 ; GFX9-NEXT: s_add_i32 s17, s17, 3 ; GFX9-NEXT: s_add_i32 s16, s16, 3 -; GFX9-NEXT: s_lshr_b32 s14, s13, 16 -; GFX9-NEXT: s_lshr_b32 s15, s12, 16 -; GFX9-NEXT: s_lshr_b32 s40, s11, 16 -; GFX9-NEXT: s_lshr_b32 s41, s10, 16 -; GFX9-NEXT: s_lshr_b32 s42, s9, 16 -; GFX9-NEXT: s_lshr_b32 s43, s8, 16 -; GFX9-NEXT: s_lshr_b32 s44, s7, 16 -; GFX9-NEXT: s_lshr_b32 s45, s6, 16 -; GFX9-NEXT: s_lshr_b32 s46, s29, 16 -; GFX9-NEXT: s_lshr_b32 s47, s28, 16 -; GFX9-NEXT: s_lshr_b32 s56, s27, 16 -; GFX9-NEXT: s_lshr_b32 s57, s26, 16 -; GFX9-NEXT: s_lshr_b32 s58, s25, 16 -; GFX9-NEXT: s_lshr_b32 s59, s24, 16 -; GFX9-NEXT: s_lshr_b32 s60, s23, 16 -; GFX9-NEXT: s_lshr_b32 s61, s22, 16 -; GFX9-NEXT: s_lshr_b32 s62, s21, 16 -; GFX9-NEXT: s_lshr_b32 s63, s20, 16 -; GFX9-NEXT: s_lshr_b32 s72, s19, 16 -; GFX9-NEXT: s_lshr_b32 s73, s18, 16 -; GFX9-NEXT: s_lshr_b32 s74, s17, 16 -; GFX9-NEXT: s_lshr_b32 s75, s16, 16 +; GFX9-NEXT: s_add_i32 s15, s15, 3 +; GFX9-NEXT: s_add_i32 s14, s14, 3 +; GFX9-NEXT: s_add_i32 s13, s13, 3 +; GFX9-NEXT: s_add_i32 s12, s12, 3 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_add_i32 s10, s10, 3 +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: s_add_i32 s6, s6, 3 +; GFX9-NEXT: s_lshr_b32 s28, s27, 16 +; GFX9-NEXT: s_lshr_b32 s29, s26, 16 +; GFX9-NEXT: s_lshr_b32 s40, s25, 16 +; GFX9-NEXT: s_lshr_b32 s41, s24, 16 +; GFX9-NEXT: s_lshr_b32 s42, s23, 16 +; GFX9-NEXT: s_lshr_b32 s43, s22, 16 +; GFX9-NEXT: s_lshr_b32 s44, s21, 16 +; GFX9-NEXT: s_lshr_b32 s45, s20, 16 +; GFX9-NEXT: s_lshr_b32 s46, s19, 16 +; GFX9-NEXT: s_lshr_b32 s47, s18, 16 +; GFX9-NEXT: s_lshr_b32 s56, s17, 16 +; GFX9-NEXT: s_lshr_b32 s57, s16, 16 +; GFX9-NEXT: s_lshr_b32 s58, s15, 16 +; GFX9-NEXT: s_lshr_b32 s59, s14, 16 +; GFX9-NEXT: s_lshr_b32 s60, s13, 16 +; GFX9-NEXT: s_lshr_b32 s61, s12, 16 +; GFX9-NEXT: s_lshr_b32 s62, s11, 16 +; GFX9-NEXT: s_lshr_b32 s63, s10, 16 +; GFX9-NEXT: s_lshr_b32 s72, s9, 16 +; GFX9-NEXT: s_lshr_b32 s73, s8, 16 +; GFX9-NEXT: s_lshr_b32 s74, s7, 16 +; GFX9-NEXT: s_lshr_b32 s75, s6, 16 ; GFX9-NEXT: .LBB13_3: ; %end -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s75 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s74 -; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s73 -; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s72 -; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s63 -; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s62 -; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s61 -; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s60 -; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s59 -; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s58 -; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s57 -; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s56 -; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s47 -; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s46 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s45 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s44 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s43 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s42 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s41 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s40 -; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s15 -; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s7, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s9, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s10, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s11, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s12, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s13, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s14, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s15, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s16, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s17, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s40 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s29 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s28 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s18 -; GFX9-NEXT: v_mov_b32_e32 v5, s19 -; GFX9-NEXT: v_mov_b32_e32 v6, s20 -; GFX9-NEXT: v_mov_b32_e32 v7, s21 -; GFX9-NEXT: v_mov_b32_e32 v8, s22 -; GFX9-NEXT: v_mov_b32_e32 v9, s23 -; GFX9-NEXT: v_mov_b32_e32 v10, s24 -; GFX9-NEXT: v_mov_b32_e32 v11, s25 -; GFX9-NEXT: v_mov_b32_e32 v12, s26 -; GFX9-NEXT: v_mov_b32_e32 v13, s27 -; GFX9-NEXT: v_mov_b32_e32 v14, s6 -; GFX9-NEXT: v_mov_b32_e32 v15, s7 -; GFX9-NEXT: v_mov_b32_e32 v16, s8 -; GFX9-NEXT: v_mov_b32_e32 v17, s9 -; GFX9-NEXT: v_mov_b32_e32 v18, s10 -; GFX9-NEXT: v_mov_b32_e32 v19, s11 -; GFX9-NEXT: v_mov_b32_e32 v20, s12 -; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-NEXT: v_mov_b32_e32 v12, s16 +; GFX9-NEXT: v_mov_b32_e32 v13, s17 +; GFX9-NEXT: v_mov_b32_e32 v14, s18 +; GFX9-NEXT: v_mov_b32_e32 v15, s19 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v18, s22 +; GFX9-NEXT: v_mov_b32_e32 v19, s23 +; GFX9-NEXT: v_mov_b32_e32 v20, s24 +; GFX9-NEXT: v_mov_b32_e32 v21, s25 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB13_4: ; GFX9-NEXT: ; implicit-def: $sgpr75 @@ -3626,40 +3710,67 @@ define inreg <44 x i16> @bitcast_v22i32_to_v44i16_scalar(<22 x i32> inreg %a, i3 ; GFX9-NEXT: ; implicit-def: $sgpr42 ; GFX9-NEXT: ; implicit-def: $sgpr41 ; GFX9-NEXT: ; implicit-def: $sgpr40 -; GFX9-NEXT: ; implicit-def: $sgpr15 -; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr29 +; GFX9-NEXT: ; implicit-def: $sgpr28 ; GFX9-NEXT: s_branch .LBB13_2 ; ; GFX11-LABEL: bitcast_v22i32_to_v44i16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-NEXT: v_dual_mov_b32 v9, s16 :: v_dual_mov_b32 v10, s17 +; GFX11-NEXT: v_dual_mov_b32 v11, s18 :: v_dual_mov_b32 v12, s19 +; GFX11-NEXT: v_dual_mov_b32 v13, s20 :: v_dual_mov_b32 v14, s21 +; GFX11-NEXT: v_dual_mov_b32 v15, s22 :: v_dual_mov_b32 v16, s23 +; GFX11-NEXT: v_dual_mov_b32 v17, s24 :: v_dual_mov_b32 v18, s25 +; GFX11-NEXT: v_dual_mov_b32 v19, s26 :: v_dual_mov_b32 v20, s27 +; GFX11-NEXT: v_dual_mov_b32 v21, s28 :: v_dual_mov_b32 v22, s29 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s7, v2 -; GFX11-NEXT: v_readfirstlane_b32 s6, v3 +; GFX11-NEXT: v_readfirstlane_b32 s0, v5 +; GFX11-NEXT: v_readfirstlane_b32 s1, v6 +; GFX11-NEXT: v_readfirstlane_b32 s2, v7 +; GFX11-NEXT: v_readfirstlane_b32 s3, v8 +; GFX11-NEXT: v_readfirstlane_b32 s4, v9 +; GFX11-NEXT: v_readfirstlane_b32 s5, v10 +; GFX11-NEXT: v_readfirstlane_b32 s6, v11 +; GFX11-NEXT: v_readfirstlane_b32 s7, v12 +; GFX11-NEXT: v_readfirstlane_b32 s8, v13 +; GFX11-NEXT: v_readfirstlane_b32 s9, v14 +; GFX11-NEXT: v_readfirstlane_b32 s10, v15 +; GFX11-NEXT: v_readfirstlane_b32 s11, v16 +; GFX11-NEXT: v_readfirstlane_b32 s12, v17 +; GFX11-NEXT: v_readfirstlane_b32 s13, v18 +; GFX11-NEXT: v_readfirstlane_b32 s14, v19 +; GFX11-NEXT: v_readfirstlane_b32 s15, v20 +; GFX11-NEXT: v_readfirstlane_b32 s16, v21 +; GFX11-NEXT: v_readfirstlane_b32 s17, v22 +; GFX11-NEXT: v_readfirstlane_b32 s18, v0 +; GFX11-NEXT: v_readfirstlane_b32 s19, v1 +; GFX11-NEXT: v_readfirstlane_b32 s21, v2 +; GFX11-NEXT: v_readfirstlane_b32 s20, v3 ; GFX11-NEXT: s_mov_b32 s62, 0 -; GFX11-NEXT: s_and_b32 s8, vcc_lo, exec_lo +; GFX11-NEXT: s_and_b32 s22, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB13_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: s_lshr_b32 s8, s6, 16 -; GFX11-NEXT: s_lshr_b32 s9, s7, 16 -; GFX11-NEXT: s_lshr_b32 s10, s5, 16 -; GFX11-NEXT: s_lshr_b32 s11, s4, 16 -; GFX11-NEXT: s_lshr_b32 s12, s29, 16 -; GFX11-NEXT: s_lshr_b32 s13, s28, 16 -; GFX11-NEXT: s_lshr_b32 s14, s27, 16 -; GFX11-NEXT: s_lshr_b32 s15, s26, 16 -; GFX11-NEXT: s_lshr_b32 s40, s25, 16 -; GFX11-NEXT: s_lshr_b32 s41, s24, 16 -; GFX11-NEXT: s_lshr_b32 s42, s23, 16 -; GFX11-NEXT: s_lshr_b32 s43, s22, 16 -; GFX11-NEXT: s_lshr_b32 s44, s21, 16 -; GFX11-NEXT: s_lshr_b32 s45, s20, 16 -; GFX11-NEXT: s_lshr_b32 s46, s19, 16 -; GFX11-NEXT: s_lshr_b32 s47, s18, 16 -; GFX11-NEXT: s_lshr_b32 s56, s17, 16 -; GFX11-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-NEXT: s_lshr_b32 s22, s20, 16 +; GFX11-NEXT: s_lshr_b32 s23, s21, 16 +; GFX11-NEXT: s_lshr_b32 s24, s19, 16 +; GFX11-NEXT: s_lshr_b32 s25, s18, 16 +; GFX11-NEXT: s_lshr_b32 s26, s17, 16 +; GFX11-NEXT: s_lshr_b32 s27, s16, 16 +; GFX11-NEXT: s_lshr_b32 s28, s15, 16 +; GFX11-NEXT: s_lshr_b32 s29, s14, 16 +; GFX11-NEXT: s_lshr_b32 s40, s13, 16 +; GFX11-NEXT: s_lshr_b32 s41, s12, 16 +; GFX11-NEXT: s_lshr_b32 s42, s11, 16 +; GFX11-NEXT: s_lshr_b32 s43, s10, 16 +; GFX11-NEXT: s_lshr_b32 s44, s9, 16 +; GFX11-NEXT: s_lshr_b32 s45, s8, 16 +; GFX11-NEXT: s_lshr_b32 s46, s7, 16 +; GFX11-NEXT: s_lshr_b32 s47, s6, 16 +; GFX11-NEXT: s_lshr_b32 s56, s5, 16 +; GFX11-NEXT: s_lshr_b32 s57, s4, 16 ; GFX11-NEXT: s_lshr_b32 s58, s3, 16 ; GFX11-NEXT: s_lshr_b32 s59, s2, 16 ; GFX11-NEXT: s_lshr_b32 s60, s1, 16 @@ -3667,46 +3778,46 @@ define inreg <44 x i16> @bitcast_v22i32_to_v44i16_scalar(<22 x i32> inreg %a, i3 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s62 ; GFX11-NEXT: s_cbranch_vccnz .LBB13_3 ; GFX11-NEXT: .LBB13_2: ; %cmp.true -; GFX11-NEXT: s_add_i32 s6, s6, 3 -; GFX11-NEXT: s_add_i32 s7, s7, 3 -; GFX11-NEXT: s_add_i32 s5, s5, 3 -; GFX11-NEXT: s_add_i32 s4, s4, 3 -; GFX11-NEXT: s_add_i32 s29, s29, 3 -; GFX11-NEXT: s_add_i32 s28, s28, 3 -; GFX11-NEXT: s_add_i32 s27, s27, 3 -; GFX11-NEXT: s_add_i32 s26, s26, 3 -; GFX11-NEXT: s_add_i32 s25, s25, 3 -; GFX11-NEXT: s_add_i32 s24, s24, 3 -; GFX11-NEXT: s_add_i32 s23, s23, 3 -; GFX11-NEXT: s_add_i32 s22, s22, 3 -; GFX11-NEXT: s_add_i32 s21, s21, 3 ; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 ; GFX11-NEXT: s_add_i32 s19, s19, 3 ; GFX11-NEXT: s_add_i32 s18, s18, 3 ; GFX11-NEXT: s_add_i32 s17, s17, 3 ; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s15, s15, 3 +; GFX11-NEXT: s_add_i32 s14, s14, 3 +; GFX11-NEXT: s_add_i32 s13, s13, 3 +; GFX11-NEXT: s_add_i32 s12, s12, 3 +; GFX11-NEXT: s_add_i32 s11, s11, 3 +; GFX11-NEXT: s_add_i32 s10, s10, 3 +; GFX11-NEXT: s_add_i32 s9, s9, 3 +; GFX11-NEXT: s_add_i32 s8, s8, 3 +; GFX11-NEXT: s_add_i32 s7, s7, 3 +; GFX11-NEXT: s_add_i32 s6, s6, 3 +; GFX11-NEXT: s_add_i32 s5, s5, 3 +; GFX11-NEXT: s_add_i32 s4, s4, 3 ; GFX11-NEXT: s_add_i32 s3, s3, 3 ; GFX11-NEXT: s_add_i32 s2, s2, 3 ; GFX11-NEXT: s_add_i32 s1, s1, 3 ; GFX11-NEXT: s_add_i32 s0, s0, 3 -; GFX11-NEXT: s_lshr_b32 s8, s6, 16 -; GFX11-NEXT: s_lshr_b32 s9, s7, 16 -; GFX11-NEXT: s_lshr_b32 s10, s5, 16 -; GFX11-NEXT: s_lshr_b32 s11, s4, 16 -; GFX11-NEXT: s_lshr_b32 s12, s29, 16 -; GFX11-NEXT: s_lshr_b32 s13, s28, 16 -; GFX11-NEXT: s_lshr_b32 s14, s27, 16 -; GFX11-NEXT: s_lshr_b32 s15, s26, 16 -; GFX11-NEXT: s_lshr_b32 s40, s25, 16 -; GFX11-NEXT: s_lshr_b32 s41, s24, 16 -; GFX11-NEXT: s_lshr_b32 s42, s23, 16 -; GFX11-NEXT: s_lshr_b32 s43, s22, 16 -; GFX11-NEXT: s_lshr_b32 s44, s21, 16 -; GFX11-NEXT: s_lshr_b32 s45, s20, 16 -; GFX11-NEXT: s_lshr_b32 s46, s19, 16 -; GFX11-NEXT: s_lshr_b32 s47, s18, 16 -; GFX11-NEXT: s_lshr_b32 s56, s17, 16 -; GFX11-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-NEXT: s_lshr_b32 s22, s20, 16 +; GFX11-NEXT: s_lshr_b32 s23, s21, 16 +; GFX11-NEXT: s_lshr_b32 s24, s19, 16 +; GFX11-NEXT: s_lshr_b32 s25, s18, 16 +; GFX11-NEXT: s_lshr_b32 s26, s17, 16 +; GFX11-NEXT: s_lshr_b32 s27, s16, 16 +; GFX11-NEXT: s_lshr_b32 s28, s15, 16 +; GFX11-NEXT: s_lshr_b32 s29, s14, 16 +; GFX11-NEXT: s_lshr_b32 s40, s13, 16 +; GFX11-NEXT: s_lshr_b32 s41, s12, 16 +; GFX11-NEXT: s_lshr_b32 s42, s11, 16 +; GFX11-NEXT: s_lshr_b32 s43, s10, 16 +; GFX11-NEXT: s_lshr_b32 s44, s9, 16 +; GFX11-NEXT: s_lshr_b32 s45, s8, 16 +; GFX11-NEXT: s_lshr_b32 s46, s7, 16 +; GFX11-NEXT: s_lshr_b32 s47, s6, 16 +; GFX11-NEXT: s_lshr_b32 s56, s5, 16 +; GFX11-NEXT: s_lshr_b32 s57, s4, 16 ; GFX11-NEXT: s_lshr_b32 s58, s3, 16 ; GFX11-NEXT: s_lshr_b32 s59, s2, 16 ; GFX11-NEXT: s_lshr_b32 s60, s1, 16 @@ -3717,35 +3828,35 @@ define inreg <44 x i16> @bitcast_v22i32_to_v44i16_scalar(<22 x i32> inreg %a, i3 ; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s60 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s59 ; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s58 -; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s57 -; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s56 -; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s47 -; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s46 -; GFX11-NEXT: s_pack_ll_b32_b16 s20, s20, s45 -; GFX11-NEXT: s_pack_ll_b32_b16 s21, s21, s44 -; GFX11-NEXT: s_pack_ll_b32_b16 s22, s22, s43 -; GFX11-NEXT: s_pack_ll_b32_b16 s23, s23, s42 -; GFX11-NEXT: s_pack_ll_b32_b16 s24, s24, s41 -; GFX11-NEXT: s_pack_ll_b32_b16 s25, s25, s40 -; GFX11-NEXT: s_pack_ll_b32_b16 s15, s26, s15 -; GFX11-NEXT: s_pack_ll_b32_b16 s14, s27, s14 -; GFX11-NEXT: s_pack_ll_b32_b16 s13, s28, s13 -; GFX11-NEXT: s_pack_ll_b32_b16 s12, s29, s12 -; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s11 -; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s10 -; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s9 -; GFX11-NEXT: s_pack_ll_b32_b16 s6, s6, s8 +; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s57 +; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s56 +; GFX11-NEXT: s_pack_ll_b32_b16 s6, s6, s47 +; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s46 +; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s45 +; GFX11-NEXT: s_pack_ll_b32_b16 s9, s9, s44 +; GFX11-NEXT: s_pack_ll_b32_b16 s10, s10, s43 +; GFX11-NEXT: s_pack_ll_b32_b16 s11, s11, s42 +; GFX11-NEXT: s_pack_ll_b32_b16 s12, s12, s41 +; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s40 +; GFX11-NEXT: s_pack_ll_b32_b16 s14, s14, s29 +; GFX11-NEXT: s_pack_ll_b32_b16 s15, s15, s28 +; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s27 +; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s26 +; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s25 +; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s24 +; GFX11-NEXT: s_pack_ll_b32_b16 s21, s21, s23 +; GFX11-NEXT: s_pack_ll_b32_b16 s20, s20, s22 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 -; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 -; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 -; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 -; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 -; GFX11-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s14 -; GFX11-NEXT: v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v17, s12 -; GFX11-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 -; GFX11-NEXT: v_dual_mov_b32 v20, s7 :: v_dual_mov_b32 v21, s6 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GFX11-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-NEXT: v_dual_mov_b32 v20, s21 :: v_dual_mov_b32 v21, s20 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB13_4: ; GFX11-NEXT: ; implicit-def: $sgpr61 @@ -3762,14 +3873,14 @@ define inreg <44 x i16> @bitcast_v22i32_to_v44i16_scalar(<22 x i32> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr41 ; GFX11-NEXT: ; implicit-def: $sgpr40 -; GFX11-NEXT: ; implicit-def: $sgpr15 -; GFX11-NEXT: ; implicit-def: $sgpr14 -; GFX11-NEXT: ; implicit-def: $sgpr13 -; GFX11-NEXT: ; implicit-def: $sgpr12 -; GFX11-NEXT: ; implicit-def: $sgpr11 -; GFX11-NEXT: ; implicit-def: $sgpr10 -; GFX11-NEXT: ; implicit-def: $sgpr9 -; GFX11-NEXT: ; implicit-def: $sgpr8 +; GFX11-NEXT: ; implicit-def: $sgpr29 +; GFX11-NEXT: ; implicit-def: $sgpr28 +; GFX11-NEXT: ; implicit-def: $sgpr27 +; GFX11-NEXT: ; implicit-def: $sgpr26 +; GFX11-NEXT: ; implicit-def: $sgpr25 +; GFX11-NEXT: ; implicit-def: $sgpr24 +; GFX11-NEXT: ; implicit-def: $sgpr23 +; GFX11-NEXT: ; implicit-def: $sgpr22 ; GFX11-NEXT: s_branch .LBB13_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -4128,7 +4239,7 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 ; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v21 ; SI-NEXT: .LBB14_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload @@ -6618,7 +6729,35 @@ define inreg <44 x half> @bitcast_v22i32_to_v44f16_scalar(<22 x i32> inreg %a, i ; SI-LABEL: bitcast_v22i32_to_v44f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v10, s16 +; SI-NEXT: v_mov_b32_e32 v11, s17 +; SI-NEXT: v_mov_b32_e32 v12, s18 +; SI-NEXT: v_mov_b32_e32 v13, s19 +; SI-NEXT: v_mov_b32_e32 v14, s20 +; SI-NEXT: v_mov_b32_e32 v15, s21 +; SI-NEXT: v_mov_b32_e32 v16, s22 +; SI-NEXT: v_mov_b32_e32 v17, s23 +; SI-NEXT: v_mov_b32_e32 v18, s24 +; SI-NEXT: v_mov_b32_e32 v19, s25 +; SI-NEXT: v_readfirstlane_b32 s23, v10 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_readfirstlane_b32 s24, v11 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_readfirstlane_b32 s25, v12 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_readfirstlane_b32 s26, v13 +; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; SI-NEXT: v_readfirstlane_b32 s27, v14 +; SI-NEXT: v_readfirstlane_b32 s22, v15 +; SI-NEXT: v_readfirstlane_b32 s21, v16 +; SI-NEXT: v_readfirstlane_b32 s20, v17 +; SI-NEXT: v_readfirstlane_b32 s19, v18 +; SI-NEXT: v_readfirstlane_b32 s18, v19 +; SI-NEXT: v_readfirstlane_b32 s17, v10 +; SI-NEXT: v_readfirstlane_b32 s16, v11 +; SI-NEXT: v_readfirstlane_b32 s15, v12 +; SI-NEXT: v_readfirstlane_b32 s14, v13 ; SI-NEXT: v_readfirstlane_b32 s13, v1 ; SI-NEXT: v_readfirstlane_b32 s12, v2 ; SI-NEXT: v_readfirstlane_b32 s11, v3 @@ -6646,33 +6785,33 @@ define inreg <44 x half> @bitcast_v22i32_to_v44f16_scalar(<22 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 ; SI-NEXT: s_lshr_b32 s4, s13, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s4, s14, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: s_lshr_b32 s4, s15, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: s_lshr_b32 s4, s17, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: s_lshr_b32 s4, s18, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: s_lshr_b32 s4, s19, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: s_lshr_b32 s4, s20, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: s_lshr_b32 s4, s27, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: s_lshr_b32 s4, s26, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: s_lshr_b32 s4, s25, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: s_lshr_b32 s4, s24, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_lshr_b32 s4, s23, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 @@ -6682,36 +6821,36 @@ define inreg <44 x half> @bitcast_v22i32_to_v44f16_scalar(<22 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 ; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s23 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 ; SI-NEXT: s_add_i32 s13, s13, 3 ; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_add_i32 s11, s11, 3 @@ -6720,20 +6859,20 @@ define inreg <44 x half> @bitcast_v22i32_to_v44f16_scalar(<22 x i32> inreg %a, i ; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: s_lshr_b32 s5, s17, 16 -; SI-NEXT: s_lshr_b32 s14, s18, 16 -; SI-NEXT: s_lshr_b32 s15, s19, 16 -; SI-NEXT: s_lshr_b32 s40, s20, 16 -; SI-NEXT: s_lshr_b32 s41, s21, 16 -; SI-NEXT: s_lshr_b32 s42, s22, 16 -; SI-NEXT: s_lshr_b32 s43, s23, 16 -; SI-NEXT: s_lshr_b32 s44, s24, 16 -; SI-NEXT: s_lshr_b32 s45, s25, 16 -; SI-NEXT: s_lshr_b32 s46, s26, 16 -; SI-NEXT: s_lshr_b32 s47, s27, 16 -; SI-NEXT: s_lshr_b32 s56, s28, 16 -; SI-NEXT: s_lshr_b32 s57, s29, 16 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: s_lshr_b32 s5, s24, 16 +; SI-NEXT: s_lshr_b32 s28, s25, 16 +; SI-NEXT: s_lshr_b32 s29, s26, 16 +; SI-NEXT: s_lshr_b32 s40, s27, 16 +; SI-NEXT: s_lshr_b32 s41, s22, 16 +; SI-NEXT: s_lshr_b32 s42, s21, 16 +; SI-NEXT: s_lshr_b32 s43, s20, 16 +; SI-NEXT: s_lshr_b32 s44, s19, 16 +; SI-NEXT: s_lshr_b32 s45, s18, 16 +; SI-NEXT: s_lshr_b32 s46, s17, 16 +; SI-NEXT: s_lshr_b32 s47, s16, 16 +; SI-NEXT: s_lshr_b32 s56, s15, 16 +; SI-NEXT: s_lshr_b32 s57, s14, 16 ; SI-NEXT: s_lshr_b32 s58, s13, 16 ; SI-NEXT: s_lshr_b32 s59, s12, 16 ; SI-NEXT: s_lshr_b32 s60, s11, 16 @@ -6750,20 +6889,20 @@ define inreg <44 x half> @bitcast_v22i32_to_v44f16_scalar(<22 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 ; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s23 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s73 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s72 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s63 @@ -6782,8 +6921,8 @@ define inreg <44 x half> @bitcast_v22i32_to_v44f16_scalar(<22 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v31, s42 ; SI-NEXT: v_cvt_f32_f16_e32 v33, s41 ; SI-NEXT: v_cvt_f32_f16_e32 v35, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s28 ; SI-NEXT: v_cvt_f32_f16_e32 v50, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 ; SI-NEXT: .LBB17_3: ; %end @@ -6991,7 +7130,35 @@ define inreg <44 x half> @bitcast_v22i32_to_v44f16_scalar(<22 x i32> inreg %a, i ; VI-LABEL: bitcast_v22i32_to_v44f16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, s16 +; VI-NEXT: v_mov_b32_e32 v10, s17 +; VI-NEXT: v_mov_b32_e32 v11, s18 +; VI-NEXT: v_mov_b32_e32 v12, s19 +; VI-NEXT: v_mov_b32_e32 v13, s20 +; VI-NEXT: v_mov_b32_e32 v14, s21 +; VI-NEXT: v_mov_b32_e32 v15, s22 +; VI-NEXT: v_mov_b32_e32 v16, s23 +; VI-NEXT: v_mov_b32_e32 v17, s24 +; VI-NEXT: v_mov_b32_e32 v18, s25 +; VI-NEXT: v_mov_b32_e32 v19, s26 +; VI-NEXT: v_readfirstlane_b32 s40, v9 +; VI-NEXT: v_mov_b32_e32 v9, s27 +; VI-NEXT: v_readfirstlane_b32 s26, v10 +; VI-NEXT: v_mov_b32_e32 v10, s28 +; VI-NEXT: v_readfirstlane_b32 s25, v11 +; VI-NEXT: v_mov_b32_e32 v11, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_readfirstlane_b32 s24, v12 +; VI-NEXT: v_readfirstlane_b32 s23, v13 +; VI-NEXT: v_readfirstlane_b32 s22, v14 +; VI-NEXT: v_readfirstlane_b32 s21, v15 +; VI-NEXT: v_readfirstlane_b32 s20, v16 +; VI-NEXT: v_readfirstlane_b32 s19, v17 +; VI-NEXT: v_readfirstlane_b32 s18, v18 +; VI-NEXT: v_readfirstlane_b32 s17, v19 +; VI-NEXT: v_readfirstlane_b32 s16, v9 +; VI-NEXT: v_readfirstlane_b32 s15, v10 +; VI-NEXT: v_readfirstlane_b32 s14, v11 ; VI-NEXT: v_readfirstlane_b32 s13, v0 ; VI-NEXT: v_readfirstlane_b32 s12, v1 ; VI-NEXT: v_readfirstlane_b32 s11, v2 @@ -7003,28 +7170,28 @@ define inreg <44 x half> @bitcast_v22i32_to_v44f16_scalar(<22 x i32> inreg %a, i ; VI-NEXT: v_readfirstlane_b32 s7, v7 ; VI-NEXT: s_cbranch_scc0 .LBB17_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s14, s7, 16 -; VI-NEXT: s_lshr_b32 s15, s6, 16 -; VI-NEXT: s_lshr_b32 s40, s8, 16 +; VI-NEXT: s_lshr_b32 s27, s7, 16 +; VI-NEXT: s_lshr_b32 s28, s6, 16 +; VI-NEXT: s_lshr_b32 s29, s8, 16 ; VI-NEXT: s_lshr_b32 s41, s9, 16 ; VI-NEXT: s_lshr_b32 s42, s10, 16 ; VI-NEXT: s_lshr_b32 s43, s11, 16 ; VI-NEXT: s_lshr_b32 s44, s12, 16 ; VI-NEXT: s_lshr_b32 s45, s13, 16 -; VI-NEXT: s_lshr_b32 s46, s29, 16 -; VI-NEXT: s_lshr_b32 s47, s28, 16 -; VI-NEXT: s_lshr_b32 s56, s27, 16 -; VI-NEXT: s_lshr_b32 s57, s26, 16 -; VI-NEXT: s_lshr_b32 s58, s25, 16 -; VI-NEXT: s_lshr_b32 s59, s24, 16 -; VI-NEXT: s_lshr_b32 s60, s23, 16 -; VI-NEXT: s_lshr_b32 s61, s22, 16 -; VI-NEXT: s_lshr_b32 s62, s21, 16 -; VI-NEXT: s_lshr_b32 s63, s20, 16 -; VI-NEXT: s_lshr_b32 s72, s19, 16 -; VI-NEXT: s_lshr_b32 s73, s18, 16 -; VI-NEXT: s_lshr_b32 s74, s17, 16 -; VI-NEXT: s_lshr_b32 s75, s16, 16 +; VI-NEXT: s_lshr_b32 s46, s14, 16 +; VI-NEXT: s_lshr_b32 s47, s15, 16 +; VI-NEXT: s_lshr_b32 s56, s16, 16 +; VI-NEXT: s_lshr_b32 s57, s17, 16 +; VI-NEXT: s_lshr_b32 s58, s18, 16 +; VI-NEXT: s_lshr_b32 s59, s19, 16 +; VI-NEXT: s_lshr_b32 s60, s20, 16 +; VI-NEXT: s_lshr_b32 s61, s21, 16 +; VI-NEXT: s_lshr_b32 s62, s22, 16 +; VI-NEXT: s_lshr_b32 s63, s23, 16 +; VI-NEXT: s_lshr_b32 s72, s24, 16 +; VI-NEXT: s_lshr_b32 s73, s25, 16 +; VI-NEXT: s_lshr_b32 s74, s26, 16 +; VI-NEXT: s_lshr_b32 s75, s40, 16 ; VI-NEXT: s_cbranch_execnz .LBB17_3 ; VI-NEXT: .LBB17_2: ; %cmp.true ; VI-NEXT: s_add_i32 s7, s7, 3 @@ -7035,123 +7202,123 @@ define inreg <44 x half> @bitcast_v22i32_to_v44f16_scalar(<22 x i32> inreg %a, i ; VI-NEXT: s_add_i32 s11, s11, 3 ; VI-NEXT: s_add_i32 s12, s12, 3 ; VI-NEXT: s_add_i32 s13, s13, 3 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_lshr_b32 s14, s7, 16 -; VI-NEXT: s_lshr_b32 s15, s6, 16 -; VI-NEXT: s_lshr_b32 s40, s8, 16 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_lshr_b32 s27, s7, 16 +; VI-NEXT: s_lshr_b32 s28, s6, 16 +; VI-NEXT: s_lshr_b32 s29, s8, 16 ; VI-NEXT: s_lshr_b32 s41, s9, 16 ; VI-NEXT: s_lshr_b32 s42, s10, 16 ; VI-NEXT: s_lshr_b32 s43, s11, 16 ; VI-NEXT: s_lshr_b32 s44, s12, 16 ; VI-NEXT: s_lshr_b32 s45, s13, 16 -; VI-NEXT: s_lshr_b32 s46, s29, 16 -; VI-NEXT: s_lshr_b32 s47, s28, 16 -; VI-NEXT: s_lshr_b32 s56, s27, 16 -; VI-NEXT: s_lshr_b32 s57, s26, 16 -; VI-NEXT: s_lshr_b32 s58, s25, 16 -; VI-NEXT: s_lshr_b32 s59, s24, 16 -; VI-NEXT: s_lshr_b32 s60, s23, 16 -; VI-NEXT: s_lshr_b32 s61, s22, 16 -; VI-NEXT: s_lshr_b32 s62, s21, 16 -; VI-NEXT: s_lshr_b32 s63, s20, 16 -; VI-NEXT: s_lshr_b32 s72, s19, 16 -; VI-NEXT: s_lshr_b32 s73, s18, 16 -; VI-NEXT: s_lshr_b32 s74, s17, 16 -; VI-NEXT: s_lshr_b32 s75, s16, 16 +; VI-NEXT: s_lshr_b32 s46, s14, 16 +; VI-NEXT: s_lshr_b32 s47, s15, 16 +; VI-NEXT: s_lshr_b32 s56, s16, 16 +; VI-NEXT: s_lshr_b32 s57, s17, 16 +; VI-NEXT: s_lshr_b32 s58, s18, 16 +; VI-NEXT: s_lshr_b32 s59, s19, 16 +; VI-NEXT: s_lshr_b32 s60, s20, 16 +; VI-NEXT: s_lshr_b32 s61, s21, 16 +; VI-NEXT: s_lshr_b32 s62, s22, 16 +; VI-NEXT: s_lshr_b32 s63, s23, 16 +; VI-NEXT: s_lshr_b32 s72, s24, 16 +; VI-NEXT: s_lshr_b32 s73, s25, 16 +; VI-NEXT: s_lshr_b32 s74, s26, 16 +; VI-NEXT: s_lshr_b32 s75, s40, 16 ; VI-NEXT: .LBB17_3: ; %end -; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_and_b32 s4, 0xffff, s40 ; VI-NEXT: s_lshl_b32 s5, s75, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s16, s74, 16 -; VI-NEXT: s_or_b32 s5, s5, s16 -; VI-NEXT: s_and_b32 s16, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s17, s73, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_and_b32 s17, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s18, s72, 16 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s18, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s19, s63, 16 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_and_b32 s19, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s20, s62, 16 -; VI-NEXT: s_or_b32 s19, s19, s20 -; VI-NEXT: s_and_b32 s20, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s21, s61, 16 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_and_b32 s21, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s22, s60, 16 -; VI-NEXT: s_or_b32 s21, s21, s22 -; VI-NEXT: s_and_b32 s22, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s23, s59, 16 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_and_b32 s23, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s24, s58, 16 -; VI-NEXT: s_or_b32 s23, s23, s24 -; VI-NEXT: s_and_b32 s24, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s25, s57, 16 -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_and_b32 s25, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s26, s56, 16 +; VI-NEXT: s_and_b32 s5, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s26, s74, 16 +; VI-NEXT: s_or_b32 s5, s5, s26 +; VI-NEXT: s_and_b32 s25, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s26, s73, 16 ; VI-NEXT: s_or_b32 s25, s25, s26 -; VI-NEXT: s_and_b32 s26, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s27, s47, 16 -; VI-NEXT: s_or_b32 s26, s26, s27 -; VI-NEXT: s_and_b32 s27, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s28, s46, 16 -; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s24, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s26, s72, 16 +; VI-NEXT: s_or_b32 s24, s24, s26 +; VI-NEXT: s_and_b32 s23, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s26, s63, 16 +; VI-NEXT: s_or_b32 s23, s23, s26 +; VI-NEXT: s_and_b32 s22, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s26, s62, 16 +; VI-NEXT: s_or_b32 s22, s22, s26 +; VI-NEXT: s_and_b32 s21, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s26, s61, 16 +; VI-NEXT: s_or_b32 s21, s21, s26 +; VI-NEXT: s_and_b32 s20, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s26, s60, 16 +; VI-NEXT: s_or_b32 s20, s20, s26 +; VI-NEXT: s_and_b32 s19, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s26, s59, 16 +; VI-NEXT: s_or_b32 s19, s19, s26 +; VI-NEXT: s_and_b32 s18, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s26, s58, 16 +; VI-NEXT: s_or_b32 s18, s18, s26 +; VI-NEXT: s_and_b32 s17, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s26, s57, 16 +; VI-NEXT: s_or_b32 s17, s17, s26 +; VI-NEXT: s_and_b32 s16, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s26, s56, 16 +; VI-NEXT: s_or_b32 s16, s16, s26 +; VI-NEXT: s_and_b32 s15, 0xffff, s15 +; VI-NEXT: s_lshl_b32 s26, s47, 16 +; VI-NEXT: s_or_b32 s15, s15, s26 +; VI-NEXT: s_and_b32 s14, 0xffff, s14 +; VI-NEXT: s_lshl_b32 s26, s46, 16 +; VI-NEXT: s_or_b32 s14, s14, s26 ; VI-NEXT: s_and_b32 s13, 0xffff, s13 -; VI-NEXT: s_lshl_b32 s28, s45, 16 -; VI-NEXT: s_or_b32 s13, s13, s28 +; VI-NEXT: s_lshl_b32 s26, s45, 16 +; VI-NEXT: s_or_b32 s13, s13, s26 ; VI-NEXT: s_and_b32 s12, 0xffff, s12 -; VI-NEXT: s_lshl_b32 s28, s44, 16 -; VI-NEXT: s_or_b32 s12, s12, s28 +; VI-NEXT: s_lshl_b32 s26, s44, 16 +; VI-NEXT: s_or_b32 s12, s12, s26 ; VI-NEXT: s_and_b32 s11, 0xffff, s11 -; VI-NEXT: s_lshl_b32 s28, s43, 16 -; VI-NEXT: s_or_b32 s11, s11, s28 +; VI-NEXT: s_lshl_b32 s26, s43, 16 +; VI-NEXT: s_or_b32 s11, s11, s26 ; VI-NEXT: s_and_b32 s10, 0xffff, s10 -; VI-NEXT: s_lshl_b32 s28, s42, 16 -; VI-NEXT: s_or_b32 s10, s10, s28 +; VI-NEXT: s_lshl_b32 s26, s42, 16 +; VI-NEXT: s_or_b32 s10, s10, s26 ; VI-NEXT: s_and_b32 s9, 0xffff, s9 -; VI-NEXT: s_lshl_b32 s28, s41, 16 -; VI-NEXT: s_or_b32 s9, s9, s28 +; VI-NEXT: s_lshl_b32 s26, s41, 16 +; VI-NEXT: s_or_b32 s9, s9, s26 ; VI-NEXT: s_and_b32 s8, 0xffff, s8 -; VI-NEXT: s_lshl_b32 s28, s40, 16 +; VI-NEXT: s_lshl_b32 s26, s29, 16 +; VI-NEXT: s_or_b32 s8, s8, s26 ; VI-NEXT: s_and_b32 s6, 0xffff, s6 -; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_lshl_b32 s26, s28, 16 +; VI-NEXT: s_or_b32 s6, s6, s26 ; VI-NEXT: s_and_b32 s7, 0xffff, s7 -; VI-NEXT: s_lshl_b32 s14, s14, 16 -; VI-NEXT: s_or_b32 s8, s8, s28 -; VI-NEXT: s_or_b32 s6, s6, s15 -; VI-NEXT: s_or_b32 s7, s7, s14 +; VI-NEXT: s_lshl_b32 s26, s27, 16 +; VI-NEXT: s_or_b32 s7, s7, s26 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: v_mov_b32_e32 v3, s17 -; VI-NEXT: v_mov_b32_e32 v4, s18 -; VI-NEXT: v_mov_b32_e32 v5, s19 -; VI-NEXT: v_mov_b32_e32 v6, s20 -; VI-NEXT: v_mov_b32_e32 v7, s21 -; VI-NEXT: v_mov_b32_e32 v8, s22 -; VI-NEXT: v_mov_b32_e32 v9, s23 -; VI-NEXT: v_mov_b32_e32 v10, s24 -; VI-NEXT: v_mov_b32_e32 v11, s25 -; VI-NEXT: v_mov_b32_e32 v12, s26 -; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v2, s25 +; VI-NEXT: v_mov_b32_e32 v3, s24 +; VI-NEXT: v_mov_b32_e32 v4, s23 +; VI-NEXT: v_mov_b32_e32 v5, s22 +; VI-NEXT: v_mov_b32_e32 v6, s21 +; VI-NEXT: v_mov_b32_e32 v7, s20 +; VI-NEXT: v_mov_b32_e32 v8, s19 +; VI-NEXT: v_mov_b32_e32 v9, s18 +; VI-NEXT: v_mov_b32_e32 v10, s17 +; VI-NEXT: v_mov_b32_e32 v11, s16 +; VI-NEXT: v_mov_b32_e32 v12, s15 +; VI-NEXT: v_mov_b32_e32 v13, s14 ; VI-NEXT: v_mov_b32_e32 v14, s13 ; VI-NEXT: v_mov_b32_e32 v15, s12 ; VI-NEXT: v_mov_b32_e32 v16, s11 @@ -7181,60 +7348,78 @@ define inreg <44 x half> @bitcast_v22i32_to_v44f16_scalar(<22 x i32> inreg %a, i ; VI-NEXT: ; implicit-def: $sgpr43 ; VI-NEXT: ; implicit-def: $sgpr42 ; VI-NEXT: ; implicit-def: $sgpr41 -; VI-NEXT: ; implicit-def: $sgpr40 -; VI-NEXT: ; implicit-def: $sgpr15 -; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 ; VI-NEXT: s_branch .LBB17_2 ; ; GFX9-LABEL: bitcast_v22i32_to_v44f16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, s16 +; GFX9-NEXT: v_mov_b32_e32 v10, s17 +; GFX9-NEXT: v_mov_b32_e32 v11, s18 +; GFX9-NEXT: v_mov_b32_e32 v12, s19 +; GFX9-NEXT: v_mov_b32_e32 v13, s20 +; GFX9-NEXT: v_mov_b32_e32 v14, s21 +; GFX9-NEXT: v_mov_b32_e32 v15, s22 +; GFX9-NEXT: v_mov_b32_e32 v16, s23 +; GFX9-NEXT: v_mov_b32_e32 v17, s24 +; GFX9-NEXT: v_mov_b32_e32 v18, s25 +; GFX9-NEXT: v_mov_b32_e32 v19, s26 +; GFX9-NEXT: v_readfirstlane_b32 s6, v9 +; GFX9-NEXT: v_mov_b32_e32 v9, s27 +; GFX9-NEXT: v_readfirstlane_b32 s7, v10 +; GFX9-NEXT: v_mov_b32_e32 v10, s28 +; GFX9-NEXT: v_readfirstlane_b32 s8, v11 +; GFX9-NEXT: v_mov_b32_e32 v11, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: v_readfirstlane_b32 s7, v1 -; GFX9-NEXT: v_readfirstlane_b32 s8, v2 -; GFX9-NEXT: v_readfirstlane_b32 s9, v3 -; GFX9-NEXT: v_readfirstlane_b32 s10, v4 -; GFX9-NEXT: v_readfirstlane_b32 s11, v5 -; GFX9-NEXT: v_readfirstlane_b32 s12, v6 +; GFX9-NEXT: v_readfirstlane_b32 s9, v12 +; GFX9-NEXT: v_readfirstlane_b32 s10, v13 +; GFX9-NEXT: v_readfirstlane_b32 s11, v14 +; GFX9-NEXT: v_readfirstlane_b32 s12, v15 +; GFX9-NEXT: v_readfirstlane_b32 s13, v16 +; GFX9-NEXT: v_readfirstlane_b32 s14, v17 +; GFX9-NEXT: v_readfirstlane_b32 s15, v18 +; GFX9-NEXT: v_readfirstlane_b32 s16, v19 +; GFX9-NEXT: v_readfirstlane_b32 s17, v9 +; GFX9-NEXT: v_readfirstlane_b32 s18, v10 +; GFX9-NEXT: v_readfirstlane_b32 s19, v11 +; GFX9-NEXT: v_readfirstlane_b32 s20, v0 +; GFX9-NEXT: v_readfirstlane_b32 s21, v1 +; GFX9-NEXT: v_readfirstlane_b32 s22, v2 +; GFX9-NEXT: v_readfirstlane_b32 s23, v3 +; GFX9-NEXT: v_readfirstlane_b32 s24, v4 +; GFX9-NEXT: v_readfirstlane_b32 s25, v5 +; GFX9-NEXT: v_readfirstlane_b32 s26, v6 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_readfirstlane_b32 s13, v7 +; GFX9-NEXT: v_readfirstlane_b32 s27, v7 ; GFX9-NEXT: s_cbranch_scc0 .LBB17_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_lshr_b32 s14, s13, 16 -; GFX9-NEXT: s_lshr_b32 s15, s12, 16 -; GFX9-NEXT: s_lshr_b32 s40, s11, 16 -; GFX9-NEXT: s_lshr_b32 s41, s10, 16 -; GFX9-NEXT: s_lshr_b32 s42, s9, 16 -; GFX9-NEXT: s_lshr_b32 s43, s8, 16 -; GFX9-NEXT: s_lshr_b32 s44, s7, 16 -; GFX9-NEXT: s_lshr_b32 s45, s6, 16 -; GFX9-NEXT: s_lshr_b32 s46, s29, 16 -; GFX9-NEXT: s_lshr_b32 s47, s28, 16 -; GFX9-NEXT: s_lshr_b32 s56, s27, 16 -; GFX9-NEXT: s_lshr_b32 s57, s26, 16 -; GFX9-NEXT: s_lshr_b32 s58, s25, 16 -; GFX9-NEXT: s_lshr_b32 s59, s24, 16 -; GFX9-NEXT: s_lshr_b32 s60, s23, 16 -; GFX9-NEXT: s_lshr_b32 s61, s22, 16 -; GFX9-NEXT: s_lshr_b32 s62, s21, 16 -; GFX9-NEXT: s_lshr_b32 s63, s20, 16 -; GFX9-NEXT: s_lshr_b32 s72, s19, 16 -; GFX9-NEXT: s_lshr_b32 s73, s18, 16 -; GFX9-NEXT: s_lshr_b32 s74, s17, 16 -; GFX9-NEXT: s_lshr_b32 s75, s16, 16 +; GFX9-NEXT: s_lshr_b32 s28, s27, 16 +; GFX9-NEXT: s_lshr_b32 s29, s26, 16 +; GFX9-NEXT: s_lshr_b32 s40, s25, 16 +; GFX9-NEXT: s_lshr_b32 s41, s24, 16 +; GFX9-NEXT: s_lshr_b32 s42, s23, 16 +; GFX9-NEXT: s_lshr_b32 s43, s22, 16 +; GFX9-NEXT: s_lshr_b32 s44, s21, 16 +; GFX9-NEXT: s_lshr_b32 s45, s20, 16 +; GFX9-NEXT: s_lshr_b32 s46, s19, 16 +; GFX9-NEXT: s_lshr_b32 s47, s18, 16 +; GFX9-NEXT: s_lshr_b32 s56, s17, 16 +; GFX9-NEXT: s_lshr_b32 s57, s16, 16 +; GFX9-NEXT: s_lshr_b32 s58, s15, 16 +; GFX9-NEXT: s_lshr_b32 s59, s14, 16 +; GFX9-NEXT: s_lshr_b32 s60, s13, 16 +; GFX9-NEXT: s_lshr_b32 s61, s12, 16 +; GFX9-NEXT: s_lshr_b32 s62, s11, 16 +; GFX9-NEXT: s_lshr_b32 s63, s10, 16 +; GFX9-NEXT: s_lshr_b32 s72, s9, 16 +; GFX9-NEXT: s_lshr_b32 s73, s8, 16 +; GFX9-NEXT: s_lshr_b32 s74, s7, 16 +; GFX9-NEXT: s_lshr_b32 s75, s6, 16 ; GFX9-NEXT: s_cbranch_execnz .LBB17_3 ; GFX9-NEXT: .LBB17_2: ; %cmp.true -; GFX9-NEXT: s_add_i32 s13, s13, 3 -; GFX9-NEXT: s_add_i32 s12, s12, 3 -; GFX9-NEXT: s_add_i32 s11, s11, 3 -; GFX9-NEXT: s_add_i32 s10, s10, 3 -; GFX9-NEXT: s_add_i32 s9, s9, 3 -; GFX9-NEXT: s_add_i32 s8, s8, 3 -; GFX9-NEXT: s_add_i32 s7, s7, 3 -; GFX9-NEXT: s_add_i32 s6, s6, 3 -; GFX9-NEXT: s_add_i32 s29, s29, 3 -; GFX9-NEXT: s_add_i32 s28, s28, 3 ; GFX9-NEXT: s_add_i32 s27, s27, 3 ; GFX9-NEXT: s_add_i32 s26, s26, 3 ; GFX9-NEXT: s_add_i32 s25, s25, 3 @@ -7247,73 +7432,83 @@ define inreg <44 x half> @bitcast_v22i32_to_v44f16_scalar(<22 x i32> inreg %a, i ; GFX9-NEXT: s_add_i32 s18, s18, 3 ; GFX9-NEXT: s_add_i32 s17, s17, 3 ; GFX9-NEXT: s_add_i32 s16, s16, 3 -; GFX9-NEXT: s_lshr_b32 s14, s13, 16 -; GFX9-NEXT: s_lshr_b32 s15, s12, 16 -; GFX9-NEXT: s_lshr_b32 s40, s11, 16 -; GFX9-NEXT: s_lshr_b32 s41, s10, 16 -; GFX9-NEXT: s_lshr_b32 s42, s9, 16 -; GFX9-NEXT: s_lshr_b32 s43, s8, 16 -; GFX9-NEXT: s_lshr_b32 s44, s7, 16 -; GFX9-NEXT: s_lshr_b32 s45, s6, 16 -; GFX9-NEXT: s_lshr_b32 s46, s29, 16 -; GFX9-NEXT: s_lshr_b32 s47, s28, 16 -; GFX9-NEXT: s_lshr_b32 s56, s27, 16 -; GFX9-NEXT: s_lshr_b32 s57, s26, 16 -; GFX9-NEXT: s_lshr_b32 s58, s25, 16 -; GFX9-NEXT: s_lshr_b32 s59, s24, 16 -; GFX9-NEXT: s_lshr_b32 s60, s23, 16 -; GFX9-NEXT: s_lshr_b32 s61, s22, 16 -; GFX9-NEXT: s_lshr_b32 s62, s21, 16 -; GFX9-NEXT: s_lshr_b32 s63, s20, 16 -; GFX9-NEXT: s_lshr_b32 s72, s19, 16 -; GFX9-NEXT: s_lshr_b32 s73, s18, 16 -; GFX9-NEXT: s_lshr_b32 s74, s17, 16 -; GFX9-NEXT: s_lshr_b32 s75, s16, 16 +; GFX9-NEXT: s_add_i32 s15, s15, 3 +; GFX9-NEXT: s_add_i32 s14, s14, 3 +; GFX9-NEXT: s_add_i32 s13, s13, 3 +; GFX9-NEXT: s_add_i32 s12, s12, 3 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_add_i32 s10, s10, 3 +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: s_add_i32 s6, s6, 3 +; GFX9-NEXT: s_lshr_b32 s28, s27, 16 +; GFX9-NEXT: s_lshr_b32 s29, s26, 16 +; GFX9-NEXT: s_lshr_b32 s40, s25, 16 +; GFX9-NEXT: s_lshr_b32 s41, s24, 16 +; GFX9-NEXT: s_lshr_b32 s42, s23, 16 +; GFX9-NEXT: s_lshr_b32 s43, s22, 16 +; GFX9-NEXT: s_lshr_b32 s44, s21, 16 +; GFX9-NEXT: s_lshr_b32 s45, s20, 16 +; GFX9-NEXT: s_lshr_b32 s46, s19, 16 +; GFX9-NEXT: s_lshr_b32 s47, s18, 16 +; GFX9-NEXT: s_lshr_b32 s56, s17, 16 +; GFX9-NEXT: s_lshr_b32 s57, s16, 16 +; GFX9-NEXT: s_lshr_b32 s58, s15, 16 +; GFX9-NEXT: s_lshr_b32 s59, s14, 16 +; GFX9-NEXT: s_lshr_b32 s60, s13, 16 +; GFX9-NEXT: s_lshr_b32 s61, s12, 16 +; GFX9-NEXT: s_lshr_b32 s62, s11, 16 +; GFX9-NEXT: s_lshr_b32 s63, s10, 16 +; GFX9-NEXT: s_lshr_b32 s72, s9, 16 +; GFX9-NEXT: s_lshr_b32 s73, s8, 16 +; GFX9-NEXT: s_lshr_b32 s74, s7, 16 +; GFX9-NEXT: s_lshr_b32 s75, s6, 16 ; GFX9-NEXT: .LBB17_3: ; %end -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s75 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s74 -; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s73 -; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s72 -; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s63 -; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s62 -; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s61 -; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s60 -; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s59 -; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s58 -; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s57 -; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s56 -; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s47 -; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s46 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s45 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s44 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s43 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s42 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s41 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s40 -; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s15 -; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s7, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s9, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s10, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s11, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s12, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s13, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s14, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s15, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s16, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s17, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s40 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s29 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s28 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s18 -; GFX9-NEXT: v_mov_b32_e32 v5, s19 -; GFX9-NEXT: v_mov_b32_e32 v6, s20 -; GFX9-NEXT: v_mov_b32_e32 v7, s21 -; GFX9-NEXT: v_mov_b32_e32 v8, s22 -; GFX9-NEXT: v_mov_b32_e32 v9, s23 -; GFX9-NEXT: v_mov_b32_e32 v10, s24 -; GFX9-NEXT: v_mov_b32_e32 v11, s25 -; GFX9-NEXT: v_mov_b32_e32 v12, s26 -; GFX9-NEXT: v_mov_b32_e32 v13, s27 -; GFX9-NEXT: v_mov_b32_e32 v14, s6 -; GFX9-NEXT: v_mov_b32_e32 v15, s7 -; GFX9-NEXT: v_mov_b32_e32 v16, s8 -; GFX9-NEXT: v_mov_b32_e32 v17, s9 -; GFX9-NEXT: v_mov_b32_e32 v18, s10 -; GFX9-NEXT: v_mov_b32_e32 v19, s11 -; GFX9-NEXT: v_mov_b32_e32 v20, s12 -; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-NEXT: v_mov_b32_e32 v12, s16 +; GFX9-NEXT: v_mov_b32_e32 v13, s17 +; GFX9-NEXT: v_mov_b32_e32 v14, s18 +; GFX9-NEXT: v_mov_b32_e32 v15, s19 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v18, s22 +; GFX9-NEXT: v_mov_b32_e32 v19, s23 +; GFX9-NEXT: v_mov_b32_e32 v20, s24 +; GFX9-NEXT: v_mov_b32_e32 v21, s25 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB17_4: ; GFX9-NEXT: ; implicit-def: $sgpr75 @@ -7336,40 +7531,67 @@ define inreg <44 x half> @bitcast_v22i32_to_v44f16_scalar(<22 x i32> inreg %a, i ; GFX9-NEXT: ; implicit-def: $sgpr42 ; GFX9-NEXT: ; implicit-def: $sgpr41 ; GFX9-NEXT: ; implicit-def: $sgpr40 -; GFX9-NEXT: ; implicit-def: $sgpr15 -; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr29 +; GFX9-NEXT: ; implicit-def: $sgpr28 ; GFX9-NEXT: s_branch .LBB17_2 ; ; GFX11-LABEL: bitcast_v22i32_to_v44f16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-NEXT: v_dual_mov_b32 v9, s16 :: v_dual_mov_b32 v10, s17 +; GFX11-NEXT: v_dual_mov_b32 v11, s18 :: v_dual_mov_b32 v12, s19 +; GFX11-NEXT: v_dual_mov_b32 v13, s20 :: v_dual_mov_b32 v14, s21 +; GFX11-NEXT: v_dual_mov_b32 v15, s22 :: v_dual_mov_b32 v16, s23 +; GFX11-NEXT: v_dual_mov_b32 v17, s24 :: v_dual_mov_b32 v18, s25 +; GFX11-NEXT: v_dual_mov_b32 v19, s26 :: v_dual_mov_b32 v20, s27 +; GFX11-NEXT: v_dual_mov_b32 v21, s28 :: v_dual_mov_b32 v22, s29 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s7, v2 -; GFX11-NEXT: v_readfirstlane_b32 s6, v3 +; GFX11-NEXT: v_readfirstlane_b32 s0, v5 +; GFX11-NEXT: v_readfirstlane_b32 s1, v6 +; GFX11-NEXT: v_readfirstlane_b32 s2, v7 +; GFX11-NEXT: v_readfirstlane_b32 s3, v8 +; GFX11-NEXT: v_readfirstlane_b32 s4, v9 +; GFX11-NEXT: v_readfirstlane_b32 s5, v10 +; GFX11-NEXT: v_readfirstlane_b32 s6, v11 +; GFX11-NEXT: v_readfirstlane_b32 s7, v12 +; GFX11-NEXT: v_readfirstlane_b32 s8, v13 +; GFX11-NEXT: v_readfirstlane_b32 s9, v14 +; GFX11-NEXT: v_readfirstlane_b32 s10, v15 +; GFX11-NEXT: v_readfirstlane_b32 s11, v16 +; GFX11-NEXT: v_readfirstlane_b32 s12, v17 +; GFX11-NEXT: v_readfirstlane_b32 s13, v18 +; GFX11-NEXT: v_readfirstlane_b32 s14, v19 +; GFX11-NEXT: v_readfirstlane_b32 s15, v20 +; GFX11-NEXT: v_readfirstlane_b32 s16, v21 +; GFX11-NEXT: v_readfirstlane_b32 s17, v22 +; GFX11-NEXT: v_readfirstlane_b32 s18, v0 +; GFX11-NEXT: v_readfirstlane_b32 s19, v1 +; GFX11-NEXT: v_readfirstlane_b32 s21, v2 +; GFX11-NEXT: v_readfirstlane_b32 s20, v3 ; GFX11-NEXT: s_mov_b32 s62, 0 -; GFX11-NEXT: s_and_b32 s8, vcc_lo, exec_lo +; GFX11-NEXT: s_and_b32 s22, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB17_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: s_lshr_b32 s8, s6, 16 -; GFX11-NEXT: s_lshr_b32 s9, s7, 16 -; GFX11-NEXT: s_lshr_b32 s10, s5, 16 -; GFX11-NEXT: s_lshr_b32 s11, s4, 16 -; GFX11-NEXT: s_lshr_b32 s12, s29, 16 -; GFX11-NEXT: s_lshr_b32 s13, s28, 16 -; GFX11-NEXT: s_lshr_b32 s14, s27, 16 -; GFX11-NEXT: s_lshr_b32 s15, s26, 16 -; GFX11-NEXT: s_lshr_b32 s40, s25, 16 -; GFX11-NEXT: s_lshr_b32 s41, s24, 16 -; GFX11-NEXT: s_lshr_b32 s42, s23, 16 -; GFX11-NEXT: s_lshr_b32 s43, s22, 16 -; GFX11-NEXT: s_lshr_b32 s44, s21, 16 -; GFX11-NEXT: s_lshr_b32 s45, s20, 16 -; GFX11-NEXT: s_lshr_b32 s46, s19, 16 -; GFX11-NEXT: s_lshr_b32 s47, s18, 16 -; GFX11-NEXT: s_lshr_b32 s56, s17, 16 -; GFX11-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-NEXT: s_lshr_b32 s22, s20, 16 +; GFX11-NEXT: s_lshr_b32 s23, s21, 16 +; GFX11-NEXT: s_lshr_b32 s24, s19, 16 +; GFX11-NEXT: s_lshr_b32 s25, s18, 16 +; GFX11-NEXT: s_lshr_b32 s26, s17, 16 +; GFX11-NEXT: s_lshr_b32 s27, s16, 16 +; GFX11-NEXT: s_lshr_b32 s28, s15, 16 +; GFX11-NEXT: s_lshr_b32 s29, s14, 16 +; GFX11-NEXT: s_lshr_b32 s40, s13, 16 +; GFX11-NEXT: s_lshr_b32 s41, s12, 16 +; GFX11-NEXT: s_lshr_b32 s42, s11, 16 +; GFX11-NEXT: s_lshr_b32 s43, s10, 16 +; GFX11-NEXT: s_lshr_b32 s44, s9, 16 +; GFX11-NEXT: s_lshr_b32 s45, s8, 16 +; GFX11-NEXT: s_lshr_b32 s46, s7, 16 +; GFX11-NEXT: s_lshr_b32 s47, s6, 16 +; GFX11-NEXT: s_lshr_b32 s56, s5, 16 +; GFX11-NEXT: s_lshr_b32 s57, s4, 16 ; GFX11-NEXT: s_lshr_b32 s58, s3, 16 ; GFX11-NEXT: s_lshr_b32 s59, s2, 16 ; GFX11-NEXT: s_lshr_b32 s60, s1, 16 @@ -7377,46 +7599,46 @@ define inreg <44 x half> @bitcast_v22i32_to_v44f16_scalar(<22 x i32> inreg %a, i ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s62 ; GFX11-NEXT: s_cbranch_vccnz .LBB17_3 ; GFX11-NEXT: .LBB17_2: ; %cmp.true -; GFX11-NEXT: s_add_i32 s6, s6, 3 -; GFX11-NEXT: s_add_i32 s7, s7, 3 -; GFX11-NEXT: s_add_i32 s5, s5, 3 -; GFX11-NEXT: s_add_i32 s4, s4, 3 -; GFX11-NEXT: s_add_i32 s29, s29, 3 -; GFX11-NEXT: s_add_i32 s28, s28, 3 -; GFX11-NEXT: s_add_i32 s27, s27, 3 -; GFX11-NEXT: s_add_i32 s26, s26, 3 -; GFX11-NEXT: s_add_i32 s25, s25, 3 -; GFX11-NEXT: s_add_i32 s24, s24, 3 -; GFX11-NEXT: s_add_i32 s23, s23, 3 -; GFX11-NEXT: s_add_i32 s22, s22, 3 -; GFX11-NEXT: s_add_i32 s21, s21, 3 ; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 ; GFX11-NEXT: s_add_i32 s19, s19, 3 ; GFX11-NEXT: s_add_i32 s18, s18, 3 ; GFX11-NEXT: s_add_i32 s17, s17, 3 ; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s15, s15, 3 +; GFX11-NEXT: s_add_i32 s14, s14, 3 +; GFX11-NEXT: s_add_i32 s13, s13, 3 +; GFX11-NEXT: s_add_i32 s12, s12, 3 +; GFX11-NEXT: s_add_i32 s11, s11, 3 +; GFX11-NEXT: s_add_i32 s10, s10, 3 +; GFX11-NEXT: s_add_i32 s9, s9, 3 +; GFX11-NEXT: s_add_i32 s8, s8, 3 +; GFX11-NEXT: s_add_i32 s7, s7, 3 +; GFX11-NEXT: s_add_i32 s6, s6, 3 +; GFX11-NEXT: s_add_i32 s5, s5, 3 +; GFX11-NEXT: s_add_i32 s4, s4, 3 ; GFX11-NEXT: s_add_i32 s3, s3, 3 ; GFX11-NEXT: s_add_i32 s2, s2, 3 ; GFX11-NEXT: s_add_i32 s1, s1, 3 ; GFX11-NEXT: s_add_i32 s0, s0, 3 -; GFX11-NEXT: s_lshr_b32 s8, s6, 16 -; GFX11-NEXT: s_lshr_b32 s9, s7, 16 -; GFX11-NEXT: s_lshr_b32 s10, s5, 16 -; GFX11-NEXT: s_lshr_b32 s11, s4, 16 -; GFX11-NEXT: s_lshr_b32 s12, s29, 16 -; GFX11-NEXT: s_lshr_b32 s13, s28, 16 -; GFX11-NEXT: s_lshr_b32 s14, s27, 16 -; GFX11-NEXT: s_lshr_b32 s15, s26, 16 -; GFX11-NEXT: s_lshr_b32 s40, s25, 16 -; GFX11-NEXT: s_lshr_b32 s41, s24, 16 -; GFX11-NEXT: s_lshr_b32 s42, s23, 16 -; GFX11-NEXT: s_lshr_b32 s43, s22, 16 -; GFX11-NEXT: s_lshr_b32 s44, s21, 16 -; GFX11-NEXT: s_lshr_b32 s45, s20, 16 -; GFX11-NEXT: s_lshr_b32 s46, s19, 16 -; GFX11-NEXT: s_lshr_b32 s47, s18, 16 -; GFX11-NEXT: s_lshr_b32 s56, s17, 16 -; GFX11-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-NEXT: s_lshr_b32 s22, s20, 16 +; GFX11-NEXT: s_lshr_b32 s23, s21, 16 +; GFX11-NEXT: s_lshr_b32 s24, s19, 16 +; GFX11-NEXT: s_lshr_b32 s25, s18, 16 +; GFX11-NEXT: s_lshr_b32 s26, s17, 16 +; GFX11-NEXT: s_lshr_b32 s27, s16, 16 +; GFX11-NEXT: s_lshr_b32 s28, s15, 16 +; GFX11-NEXT: s_lshr_b32 s29, s14, 16 +; GFX11-NEXT: s_lshr_b32 s40, s13, 16 +; GFX11-NEXT: s_lshr_b32 s41, s12, 16 +; GFX11-NEXT: s_lshr_b32 s42, s11, 16 +; GFX11-NEXT: s_lshr_b32 s43, s10, 16 +; GFX11-NEXT: s_lshr_b32 s44, s9, 16 +; GFX11-NEXT: s_lshr_b32 s45, s8, 16 +; GFX11-NEXT: s_lshr_b32 s46, s7, 16 +; GFX11-NEXT: s_lshr_b32 s47, s6, 16 +; GFX11-NEXT: s_lshr_b32 s56, s5, 16 +; GFX11-NEXT: s_lshr_b32 s57, s4, 16 ; GFX11-NEXT: s_lshr_b32 s58, s3, 16 ; GFX11-NEXT: s_lshr_b32 s59, s2, 16 ; GFX11-NEXT: s_lshr_b32 s60, s1, 16 @@ -7427,35 +7649,35 @@ define inreg <44 x half> @bitcast_v22i32_to_v44f16_scalar(<22 x i32> inreg %a, i ; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s60 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s59 ; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s58 -; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s57 -; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s56 -; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s47 -; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s46 -; GFX11-NEXT: s_pack_ll_b32_b16 s20, s20, s45 -; GFX11-NEXT: s_pack_ll_b32_b16 s21, s21, s44 -; GFX11-NEXT: s_pack_ll_b32_b16 s22, s22, s43 -; GFX11-NEXT: s_pack_ll_b32_b16 s23, s23, s42 -; GFX11-NEXT: s_pack_ll_b32_b16 s24, s24, s41 -; GFX11-NEXT: s_pack_ll_b32_b16 s25, s25, s40 -; GFX11-NEXT: s_pack_ll_b32_b16 s15, s26, s15 -; GFX11-NEXT: s_pack_ll_b32_b16 s14, s27, s14 -; GFX11-NEXT: s_pack_ll_b32_b16 s13, s28, s13 -; GFX11-NEXT: s_pack_ll_b32_b16 s12, s29, s12 -; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s11 -; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s10 -; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s9 -; GFX11-NEXT: s_pack_ll_b32_b16 s6, s6, s8 +; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s57 +; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s56 +; GFX11-NEXT: s_pack_ll_b32_b16 s6, s6, s47 +; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s46 +; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s45 +; GFX11-NEXT: s_pack_ll_b32_b16 s9, s9, s44 +; GFX11-NEXT: s_pack_ll_b32_b16 s10, s10, s43 +; GFX11-NEXT: s_pack_ll_b32_b16 s11, s11, s42 +; GFX11-NEXT: s_pack_ll_b32_b16 s12, s12, s41 +; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s40 +; GFX11-NEXT: s_pack_ll_b32_b16 s14, s14, s29 +; GFX11-NEXT: s_pack_ll_b32_b16 s15, s15, s28 +; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s27 +; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s26 +; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s25 +; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s24 +; GFX11-NEXT: s_pack_ll_b32_b16 s21, s21, s23 +; GFX11-NEXT: s_pack_ll_b32_b16 s20, s20, s22 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 -; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 -; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 -; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 -; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 -; GFX11-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s14 -; GFX11-NEXT: v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v17, s12 -; GFX11-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 -; GFX11-NEXT: v_dual_mov_b32 v20, s7 :: v_dual_mov_b32 v21, s6 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GFX11-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-NEXT: v_dual_mov_b32 v20, s21 :: v_dual_mov_b32 v21, s20 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB17_4: ; GFX11-NEXT: ; implicit-def: $sgpr61 @@ -7472,14 +7694,14 @@ define inreg <44 x half> @bitcast_v22i32_to_v44f16_scalar(<22 x i32> inreg %a, i ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr41 ; GFX11-NEXT: ; implicit-def: $sgpr40 -; GFX11-NEXT: ; implicit-def: $sgpr15 -; GFX11-NEXT: ; implicit-def: $sgpr14 -; GFX11-NEXT: ; implicit-def: $sgpr13 -; GFX11-NEXT: ; implicit-def: $sgpr12 -; GFX11-NEXT: ; implicit-def: $sgpr11 -; GFX11-NEXT: ; implicit-def: $sgpr10 -; GFX11-NEXT: ; implicit-def: $sgpr9 -; GFX11-NEXT: ; implicit-def: $sgpr8 +; GFX11-NEXT: ; implicit-def: $sgpr29 +; GFX11-NEXT: ; implicit-def: $sgpr28 +; GFX11-NEXT: ; implicit-def: $sgpr27 +; GFX11-NEXT: ; implicit-def: $sgpr26 +; GFX11-NEXT: ; implicit-def: $sgpr25 +; GFX11-NEXT: ; implicit-def: $sgpr24 +; GFX11-NEXT: ; implicit-def: $sgpr23 +; GFX11-NEXT: ; implicit-def: $sgpr22 ; GFX11-NEXT: s_branch .LBB17_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -13090,7 +13312,7 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 ; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v21 ; SI-NEXT: .LBB30_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload @@ -15559,397 +15781,446 @@ define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; SI-NEXT: v_readfirstlane_b32 s13, v1 -; SI-NEXT: v_readfirstlane_b32 s12, v2 -; SI-NEXT: v_readfirstlane_b32 s11, v3 -; SI-NEXT: v_readfirstlane_b32 s10, v4 -; SI-NEXT: v_readfirstlane_b32 s8, v5 -; SI-NEXT: v_readfirstlane_b32 s7, v6 -; SI-NEXT: v_readfirstlane_b32 s6, v7 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v12, s16 +; SI-NEXT: v_mov_b32_e32 v11, s17 +; SI-NEXT: v_mov_b32_e32 v10, s18 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v62, s19 +; SI-NEXT: v_mov_b32_e32 v61, s20 +; SI-NEXT: v_mov_b32_e32 v58, s21 +; SI-NEXT: v_mov_b32_e32 v56, s22 +; SI-NEXT: v_mov_b32_e32 v46, s23 +; SI-NEXT: v_mov_b32_e32 v45, s24 +; SI-NEXT: v_mov_b32_e32 v43, s25 +; SI-NEXT: v_mov_b32_e32 v59, s26 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v8 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB33_4 +; SI-NEXT: v_mov_b32_e32 v60, s27 +; SI-NEXT: v_mov_b32_e32 v57, s28 +; SI-NEXT: v_mov_b32_e32 v47, s29 +; SI-NEXT: s_cbranch_scc0 .LBB33_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s16 -; SI-NEXT: s_cbranch_execnz .LBB33_3 -; SI-NEXT: .LBB33_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v16, s24, 1.0 -; SI-NEXT: v_add_f32_e64 v18, s25, 1.0 -; SI-NEXT: v_add_f32_e64 v20, s26, 1.0 -; SI-NEXT: v_add_f32_e64 v22, s27, 1.0 -; SI-NEXT: v_add_f32_e64 v21, s28, 1.0 -; SI-NEXT: v_add_f32_e64 v19, s29, 1.0 -; SI-NEXT: v_add_f32_e64 v17, s13, 1.0 -; SI-NEXT: v_add_f32_e64 v15, s12, 1.0 -; SI-NEXT: v_add_f32_e64 v13, s11, 1.0 -; SI-NEXT: v_add_f32_e64 v11, s10, 1.0 -; SI-NEXT: v_add_f32_e64 v9, s8, 1.0 -; SI-NEXT: v_add_f32_e64 v7, s7, 1.0 -; SI-NEXT: v_add_f32_e64 v5, s6, 1.0 -; SI-NEXT: v_add_f32_e64 v3, s9, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v17 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v15 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v13 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v11 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v7 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v46 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v62 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v4 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v13, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v56 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v12 +; SI-NEXT: s_branch .LBB33_3 +; SI-NEXT: .LBB33_2: +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: .LBB33_3: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, v14 +; SI-NEXT: v_mov_b32_e32 v14, v16 +; SI-NEXT: v_mov_b32_e32 v16, v18 +; SI-NEXT: v_mov_b32_e32 v18, v20 +; SI-NEXT: s_cbranch_vccnz .LBB33_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v9, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v62 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v61 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v58 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v56 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v46 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v45 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v43 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v59 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v60 +; SI-NEXT: v_add_f32_e32 v29, 1.0, v57 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v47 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: .LBB33_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_or_b32_e32 v51, v51, v52 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v12 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: .LBB33_5: ; %end ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v51, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v49, v49, v50 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v44 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v49, vcc, 8, v0 -; SI-NEXT: v_or_b32_e32 v39, v39, v48 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: buffer_store_dword v39, v49, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v39, vcc, 12, v0 -; SI-NEXT: v_or_b32_e32 v37, v37, v38 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v37, vcc, 16, v0 -; SI-NEXT: v_or_b32_e32 v35, v36, v35 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v35, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v33, v34, v33 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v33, vcc, 24, v0 -; SI-NEXT: v_or_b32_e32 v31, v32, v31 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v31, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v29, v30, v29 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v29, vcc, 32, v0 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v27, vcc, 36, v0 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v25, vcc, 40, v0 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v23, vcc, 44, v0 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v30 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 -; SI-NEXT: v_add_i32_e32 v15, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v23 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v17 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB33_4: -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: s_branch .LBB33_2 ; ; VI-LABEL: bitcast_v22f32_to_v44f16_scalar: ; VI: ; %bb.0: @@ -20231,7 +20502,35 @@ define inreg <44 x i16> @bitcast_v11i64_to_v44i16_scalar(<11 x i64> inreg %a, i3 ; SI-LABEL: bitcast_v11i64_to_v44i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v10, s16 +; SI-NEXT: v_mov_b32_e32 v11, s17 +; SI-NEXT: v_mov_b32_e32 v12, s18 +; SI-NEXT: v_mov_b32_e32 v13, s19 +; SI-NEXT: v_mov_b32_e32 v14, s20 +; SI-NEXT: v_mov_b32_e32 v15, s21 +; SI-NEXT: v_mov_b32_e32 v16, s22 +; SI-NEXT: v_mov_b32_e32 v17, s23 +; SI-NEXT: v_mov_b32_e32 v18, s24 +; SI-NEXT: v_mov_b32_e32 v19, s25 +; SI-NEXT: v_readfirstlane_b32 s24, v10 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_readfirstlane_b32 s25, v11 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_readfirstlane_b32 s22, v12 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_readfirstlane_b32 s23, v13 +; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; SI-NEXT: v_readfirstlane_b32 s20, v14 +; SI-NEXT: v_readfirstlane_b32 s21, v15 +; SI-NEXT: v_readfirstlane_b32 s18, v16 +; SI-NEXT: v_readfirstlane_b32 s19, v17 +; SI-NEXT: v_readfirstlane_b32 s16, v18 +; SI-NEXT: v_readfirstlane_b32 s17, v19 +; SI-NEXT: v_readfirstlane_b32 s14, v10 +; SI-NEXT: v_readfirstlane_b32 s15, v11 +; SI-NEXT: v_readfirstlane_b32 s12, v12 +; SI-NEXT: v_readfirstlane_b32 s13, v13 ; SI-NEXT: v_readfirstlane_b32 s10, v1 ; SI-NEXT: v_readfirstlane_b32 s11, v2 ; SI-NEXT: v_readfirstlane_b32 s8, v3 @@ -20239,7 +20538,7 @@ define inreg <44 x i16> @bitcast_v11i64_to_v44i16_scalar(<11 x i64> inreg %a, i3 ; SI-NEXT: v_readfirstlane_b32 s6, v5 ; SI-NEXT: v_readfirstlane_b32 s7, v6 ; SI-NEXT: v_readfirstlane_b32 s4, v7 -; SI-NEXT: s_and_b64 s[12:13], vcc, exec +; SI-NEXT: s_and_b64 s[26:27], vcc, exec ; SI-NEXT: v_readfirstlane_b32 s5, v8 ; SI-NEXT: s_cbranch_scc0 .LBB41_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -20247,24 +20546,24 @@ define inreg <44 x i16> @bitcast_v11i64_to_v44i16_scalar(<11 x i64> inreg %a, i3 ; SI-NEXT: s_lshr_b32 s77, s7, 16 ; SI-NEXT: s_lshr_b32 s78, s9, 16 ; SI-NEXT: s_lshr_b32 s79, s11, 16 -; SI-NEXT: s_lshr_b32 s88, s29, 16 -; SI-NEXT: s_lshr_b32 s89, s27, 16 -; SI-NEXT: s_lshr_b32 s90, s25, 16 -; SI-NEXT: s_lshr_b32 s91, s23, 16 +; SI-NEXT: s_lshr_b32 s88, s13, 16 +; SI-NEXT: s_lshr_b32 s89, s15, 16 +; SI-NEXT: s_lshr_b32 s90, s17, 16 +; SI-NEXT: s_lshr_b32 s91, s19, 16 ; SI-NEXT: s_lshr_b32 s92, s21, 16 -; SI-NEXT: s_lshr_b32 s93, s19, 16 -; SI-NEXT: s_lshr_b32 s94, s17, 16 -; SI-NEXT: s_lshr_b64 s[12:13], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[14:15], s[6:7], 16 +; SI-NEXT: s_lshr_b32 s93, s23, 16 +; SI-NEXT: s_lshr_b32 s94, s25, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 ; SI-NEXT: s_lshr_b64 s[40:41], s[8:9], 16 ; SI-NEXT: s_lshr_b64 s[42:43], s[10:11], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[56:57], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[58:59], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[18:19], 16 ; SI-NEXT: s_lshr_b64 s[60:61], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[62:63], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[72:73], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[24:25], 16 ; SI-NEXT: s_cbranch_execnz .LBB41_3 ; SI-NEXT: .LBB41_2: ; %cmp.true ; SI-NEXT: s_add_u32 s4, s4, 3 @@ -20275,139 +20574,139 @@ define inreg <44 x i16> @bitcast_v11i64_to_v44i16_scalar(<11 x i64> inreg %a, i3 ; SI-NEXT: s_addc_u32 s9, s9, 0 ; SI-NEXT: s_add_u32 s10, s10, 3 ; SI-NEXT: s_addc_u32 s11, s11, 0 -; SI-NEXT: s_add_u32 s28, s28, 3 -; SI-NEXT: s_addc_u32 s29, s29, 0 -; SI-NEXT: s_add_u32 s26, s26, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 -; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s25, s25, 0 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 ; SI-NEXT: s_add_u32 s16, s16, 3 ; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 ; SI-NEXT: s_lshr_b32 s76, s5, 16 ; SI-NEXT: s_lshr_b32 s77, s7, 16 ; SI-NEXT: s_lshr_b32 s78, s9, 16 ; SI-NEXT: s_lshr_b32 s79, s11, 16 -; SI-NEXT: s_lshr_b32 s88, s29, 16 -; SI-NEXT: s_lshr_b32 s89, s27, 16 -; SI-NEXT: s_lshr_b32 s90, s25, 16 -; SI-NEXT: s_lshr_b32 s91, s23, 16 +; SI-NEXT: s_lshr_b32 s88, s13, 16 +; SI-NEXT: s_lshr_b32 s89, s15, 16 +; SI-NEXT: s_lshr_b32 s90, s17, 16 +; SI-NEXT: s_lshr_b32 s91, s19, 16 ; SI-NEXT: s_lshr_b32 s92, s21, 16 -; SI-NEXT: s_lshr_b32 s93, s19, 16 -; SI-NEXT: s_lshr_b32 s94, s17, 16 -; SI-NEXT: s_lshr_b64 s[12:13], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[14:15], s[6:7], 16 +; SI-NEXT: s_lshr_b32 s93, s23, 16 +; SI-NEXT: s_lshr_b32 s94, s25, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 ; SI-NEXT: s_lshr_b64 s[40:41], s[8:9], 16 ; SI-NEXT: s_lshr_b64 s[42:43], s[10:11], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[56:57], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[58:59], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[18:19], 16 ; SI-NEXT: s_lshr_b64 s[60:61], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[62:63], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[72:73], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[24:25], 16 ; SI-NEXT: .LBB41_3: ; %end -; SI-NEXT: s_lshl_b32 s13, s72, 16 -; SI-NEXT: s_and_b32 s15, s16, 0xffff -; SI-NEXT: s_or_b32 s13, s15, s13 -; SI-NEXT: v_mov_b32_e32 v1, s13 -; SI-NEXT: s_and_b32 s13, s17, 0xffff -; SI-NEXT: s_lshl_b32 s15, s94, 16 -; SI-NEXT: s_or_b32 s13, s13, s15 -; SI-NEXT: v_mov_b32_e32 v2, s13 -; SI-NEXT: s_lshl_b32 s13, s62, 16 -; SI-NEXT: s_and_b32 s15, s18, 0xffff -; SI-NEXT: s_or_b32 s13, s15, s13 -; SI-NEXT: v_mov_b32_e32 v3, s13 -; SI-NEXT: s_and_b32 s13, s19, 0xffff -; SI-NEXT: s_lshl_b32 s15, s93, 16 -; SI-NEXT: s_or_b32 s13, s13, s15 -; SI-NEXT: v_mov_b32_e32 v4, s13 -; SI-NEXT: s_lshl_b32 s13, s60, 16 -; SI-NEXT: s_and_b32 s15, s20, 0xffff +; SI-NEXT: s_lshl_b32 s27, s72, 16 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_or_b32 s24, s24, s27 +; SI-NEXT: v_mov_b32_e32 v1, s24 +; SI-NEXT: s_and_b32 s24, s25, 0xffff +; SI-NEXT: s_lshl_b32 s25, s94, 16 +; SI-NEXT: s_or_b32 s24, s24, s25 +; SI-NEXT: v_mov_b32_e32 v2, s24 +; SI-NEXT: s_lshl_b32 s24, s62, 16 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_or_b32 s22, s22, s24 +; SI-NEXT: v_mov_b32_e32 v3, s22 +; SI-NEXT: s_and_b32 s22, s23, 0xffff +; SI-NEXT: s_lshl_b32 s23, s93, 16 +; SI-NEXT: s_or_b32 s22, s22, s23 +; SI-NEXT: v_mov_b32_e32 v4, s22 +; SI-NEXT: s_lshl_b32 s22, s60, 16 +; SI-NEXT: s_and_b32 s20, s20, 0xffff ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: s_or_b32 s13, s15, s13 +; SI-NEXT: s_or_b32 s20, s20, s22 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v2, s13 -; SI-NEXT: s_and_b32 s13, s21, 0xffff -; SI-NEXT: s_lshl_b32 s15, s92, 16 +; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_and_b32 s20, s21, 0xffff +; SI-NEXT: s_lshl_b32 s21, s92, 16 ; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_or_b32 s13, s13, s15 +; SI-NEXT: s_or_b32 s20, s20, s21 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s13 -; SI-NEXT: s_and_b32 s13, s22, 0xffff -; SI-NEXT: s_lshl_b32 s15, s58, 16 +; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s20, s58, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_or_b32 s13, s13, s15 +; SI-NEXT: s_or_b32 s18, s18, s20 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s13 -; SI-NEXT: s_and_b32 s13, s23, 0xffff -; SI-NEXT: s_lshl_b32 s15, s91, 16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_and_b32 s18, s19, 0xffff +; SI-NEXT: s_lshl_b32 s19, s91, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_or_b32 s13, s13, s15 +; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s13 -; SI-NEXT: s_and_b32 s13, s24, 0xffff -; SI-NEXT: s_lshl_b32 s15, s56, 16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s18, s56, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s13, s13, s15 +; SI-NEXT: s_or_b32 s16, s16, s18 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s13 -; SI-NEXT: s_and_b32 s13, s25, 0xffff -; SI-NEXT: s_lshl_b32 s15, s90, 16 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s17, 0xffff +; SI-NEXT: s_lshl_b32 s17, s90, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s13, s13, s15 +; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s13 -; SI-NEXT: s_and_b32 s13, s26, 0xffff -; SI-NEXT: s_lshl_b32 s15, s46, 16 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s16, s46, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: s_or_b32 s13, s13, s15 +; SI-NEXT: s_or_b32 s14, s14, s16 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s13 -; SI-NEXT: s_and_b32 s13, s27, 0xffff +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_and_b32 s14, s15, 0xffff ; SI-NEXT: s_lshl_b32 s15, s89, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 -; SI-NEXT: s_or_b32 s13, s13, s15 +; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s13 -; SI-NEXT: s_and_b32 s13, s28, 0xffff -; SI-NEXT: s_lshl_b32 s15, s44, 16 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s14, s44, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s13, s13, s15 +; SI-NEXT: s_or_b32 s12, s12, s14 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s13 -; SI-NEXT: s_and_b32 s13, s29, 0xffff -; SI-NEXT: s_lshl_b32 s15, s88, 16 +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s12, s13, 0xffff +; SI-NEXT: s_lshl_b32 s13, s88, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_or_b32 s13, s13, s15 +; SI-NEXT: s_or_b32 s12, s12, s13 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s13 +; SI-NEXT: v_mov_b32_e32 v2, s12 ; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: s_lshl_b32 s13, s42, 16 +; SI-NEXT: s_lshl_b32 s12, s42, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 -; SI-NEXT: s_or_b32 s10, s10, s13 +; SI-NEXT: s_or_b32 s10, s10, s12 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s10 @@ -20433,7 +20732,7 @@ define inreg <44 x i16> @bitcast_v11i64_to_v44i16_scalar(<11 x i64> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_lshl_b32 s8, s14, 16 +; SI-NEXT: s_lshl_b32 s8, s28, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 ; SI-NEXT: s_or_b32 s6, s6, s8 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -20447,7 +20746,7 @@ define inreg <44 x i16> @bitcast_v11i64_to_v44i16_scalar(<11 x i64> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s6, s12, 16 +; SI-NEXT: s_lshl_b32 s6, s26, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x4c, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -20482,16 +20781,44 @@ define inreg <44 x i16> @bitcast_v11i64_to_v44i16_scalar(<11 x i64> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr79 ; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: ; implicit-def: $sgpr78 -; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr77 -; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr76 ; SI-NEXT: s_branch .LBB41_2 ; ; VI-LABEL: bitcast_v11i64_to_v44i16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, s16 +; VI-NEXT: v_mov_b32_e32 v10, s17 +; VI-NEXT: v_mov_b32_e32 v11, s18 +; VI-NEXT: v_mov_b32_e32 v12, s19 +; VI-NEXT: v_mov_b32_e32 v13, s20 +; VI-NEXT: v_mov_b32_e32 v14, s21 +; VI-NEXT: v_mov_b32_e32 v15, s22 +; VI-NEXT: v_mov_b32_e32 v16, s23 +; VI-NEXT: v_mov_b32_e32 v17, s24 +; VI-NEXT: v_mov_b32_e32 v18, s25 +; VI-NEXT: v_mov_b32_e32 v19, s26 +; VI-NEXT: v_readfirstlane_b32 s40, v9 +; VI-NEXT: v_mov_b32_e32 v9, s27 +; VI-NEXT: v_readfirstlane_b32 s26, v10 +; VI-NEXT: v_mov_b32_e32 v10, s28 +; VI-NEXT: v_readfirstlane_b32 s25, v11 +; VI-NEXT: v_mov_b32_e32 v11, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_readfirstlane_b32 s24, v12 +; VI-NEXT: v_readfirstlane_b32 s23, v13 +; VI-NEXT: v_readfirstlane_b32 s22, v14 +; VI-NEXT: v_readfirstlane_b32 s21, v15 +; VI-NEXT: v_readfirstlane_b32 s20, v16 +; VI-NEXT: v_readfirstlane_b32 s19, v17 +; VI-NEXT: v_readfirstlane_b32 s18, v18 +; VI-NEXT: v_readfirstlane_b32 s17, v19 +; VI-NEXT: v_readfirstlane_b32 s16, v9 +; VI-NEXT: v_readfirstlane_b32 s15, v10 +; VI-NEXT: v_readfirstlane_b32 s14, v11 ; VI-NEXT: v_readfirstlane_b32 s13, v0 ; VI-NEXT: v_readfirstlane_b32 s12, v1 ; VI-NEXT: v_readfirstlane_b32 s11, v2 @@ -20503,28 +20830,28 @@ define inreg <44 x i16> @bitcast_v11i64_to_v44i16_scalar(<11 x i64> inreg %a, i3 ; VI-NEXT: v_readfirstlane_b32 s7, v7 ; VI-NEXT: s_cbranch_scc0 .LBB41_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s14, s7, 16 -; VI-NEXT: s_lshr_b32 s15, s6, 16 -; VI-NEXT: s_lshr_b32 s40, s8, 16 +; VI-NEXT: s_lshr_b32 s27, s7, 16 +; VI-NEXT: s_lshr_b32 s28, s6, 16 +; VI-NEXT: s_lshr_b32 s29, s8, 16 ; VI-NEXT: s_lshr_b32 s41, s9, 16 ; VI-NEXT: s_lshr_b32 s42, s10, 16 ; VI-NEXT: s_lshr_b32 s43, s11, 16 ; VI-NEXT: s_lshr_b32 s44, s12, 16 ; VI-NEXT: s_lshr_b32 s45, s13, 16 -; VI-NEXT: s_lshr_b32 s46, s29, 16 -; VI-NEXT: s_lshr_b32 s47, s28, 16 -; VI-NEXT: s_lshr_b32 s56, s27, 16 -; VI-NEXT: s_lshr_b32 s57, s26, 16 -; VI-NEXT: s_lshr_b32 s58, s25, 16 -; VI-NEXT: s_lshr_b32 s59, s24, 16 -; VI-NEXT: s_lshr_b32 s60, s23, 16 -; VI-NEXT: s_lshr_b32 s61, s22, 16 -; VI-NEXT: s_lshr_b32 s62, s21, 16 -; VI-NEXT: s_lshr_b32 s63, s20, 16 -; VI-NEXT: s_lshr_b32 s72, s19, 16 -; VI-NEXT: s_lshr_b32 s73, s18, 16 -; VI-NEXT: s_lshr_b32 s74, s17, 16 -; VI-NEXT: s_lshr_b32 s75, s16, 16 +; VI-NEXT: s_lshr_b32 s46, s14, 16 +; VI-NEXT: s_lshr_b32 s47, s15, 16 +; VI-NEXT: s_lshr_b32 s56, s16, 16 +; VI-NEXT: s_lshr_b32 s57, s17, 16 +; VI-NEXT: s_lshr_b32 s58, s18, 16 +; VI-NEXT: s_lshr_b32 s59, s19, 16 +; VI-NEXT: s_lshr_b32 s60, s20, 16 +; VI-NEXT: s_lshr_b32 s61, s21, 16 +; VI-NEXT: s_lshr_b32 s62, s22, 16 +; VI-NEXT: s_lshr_b32 s63, s23, 16 +; VI-NEXT: s_lshr_b32 s72, s24, 16 +; VI-NEXT: s_lshr_b32 s73, s25, 16 +; VI-NEXT: s_lshr_b32 s74, s26, 16 +; VI-NEXT: s_lshr_b32 s75, s40, 16 ; VI-NEXT: s_cbranch_execnz .LBB41_3 ; VI-NEXT: .LBB41_2: ; %cmp.true ; VI-NEXT: s_add_u32 s6, s6, 3 @@ -20535,123 +20862,123 @@ define inreg <44 x i16> @bitcast_v11i64_to_v44i16_scalar(<11 x i64> inreg %a, i3 ; VI-NEXT: s_addc_u32 s10, s10, 0 ; VI-NEXT: s_add_u32 s13, s13, 3 ; VI-NEXT: s_addc_u32 s12, s12, 0 -; VI-NEXT: s_add_u32 s28, s28, 3 -; VI-NEXT: s_addc_u32 s29, s29, 0 -; VI-NEXT: s_add_u32 s26, s26, 3 -; VI-NEXT: s_addc_u32 s27, s27, 0 -; VI-NEXT: s_add_u32 s24, s24, 3 -; VI-NEXT: s_addc_u32 s25, s25, 0 -; VI-NEXT: s_add_u32 s22, s22, 3 -; VI-NEXT: s_addc_u32 s23, s23, 0 -; VI-NEXT: s_add_u32 s20, s20, 3 -; VI-NEXT: s_addc_u32 s21, s21, 0 -; VI-NEXT: s_add_u32 s18, s18, 3 -; VI-NEXT: s_addc_u32 s19, s19, 0 -; VI-NEXT: s_add_u32 s16, s16, 3 -; VI-NEXT: s_addc_u32 s17, s17, 0 -; VI-NEXT: s_lshr_b32 s14, s7, 16 -; VI-NEXT: s_lshr_b32 s15, s6, 16 -; VI-NEXT: s_lshr_b32 s40, s8, 16 +; VI-NEXT: s_add_u32 s15, s15, 3 +; VI-NEXT: s_addc_u32 s14, s14, 0 +; VI-NEXT: s_add_u32 s17, s17, 3 +; VI-NEXT: s_addc_u32 s16, s16, 0 +; VI-NEXT: s_add_u32 s19, s19, 3 +; VI-NEXT: s_addc_u32 s18, s18, 0 +; VI-NEXT: s_add_u32 s21, s21, 3 +; VI-NEXT: s_addc_u32 s20, s20, 0 +; VI-NEXT: s_add_u32 s23, s23, 3 +; VI-NEXT: s_addc_u32 s22, s22, 0 +; VI-NEXT: s_add_u32 s25, s25, 3 +; VI-NEXT: s_addc_u32 s24, s24, 0 +; VI-NEXT: s_add_u32 s40, s40, 3 +; VI-NEXT: s_addc_u32 s26, s26, 0 +; VI-NEXT: s_lshr_b32 s27, s7, 16 +; VI-NEXT: s_lshr_b32 s28, s6, 16 +; VI-NEXT: s_lshr_b32 s29, s8, 16 ; VI-NEXT: s_lshr_b32 s41, s9, 16 ; VI-NEXT: s_lshr_b32 s42, s10, 16 ; VI-NEXT: s_lshr_b32 s43, s11, 16 ; VI-NEXT: s_lshr_b32 s44, s12, 16 ; VI-NEXT: s_lshr_b32 s45, s13, 16 -; VI-NEXT: s_lshr_b32 s46, s29, 16 -; VI-NEXT: s_lshr_b32 s47, s28, 16 -; VI-NEXT: s_lshr_b32 s56, s27, 16 -; VI-NEXT: s_lshr_b32 s57, s26, 16 -; VI-NEXT: s_lshr_b32 s58, s25, 16 -; VI-NEXT: s_lshr_b32 s59, s24, 16 -; VI-NEXT: s_lshr_b32 s60, s23, 16 -; VI-NEXT: s_lshr_b32 s61, s22, 16 -; VI-NEXT: s_lshr_b32 s62, s21, 16 -; VI-NEXT: s_lshr_b32 s63, s20, 16 -; VI-NEXT: s_lshr_b32 s72, s19, 16 -; VI-NEXT: s_lshr_b32 s73, s18, 16 -; VI-NEXT: s_lshr_b32 s74, s17, 16 -; VI-NEXT: s_lshr_b32 s75, s16, 16 +; VI-NEXT: s_lshr_b32 s46, s14, 16 +; VI-NEXT: s_lshr_b32 s47, s15, 16 +; VI-NEXT: s_lshr_b32 s56, s16, 16 +; VI-NEXT: s_lshr_b32 s57, s17, 16 +; VI-NEXT: s_lshr_b32 s58, s18, 16 +; VI-NEXT: s_lshr_b32 s59, s19, 16 +; VI-NEXT: s_lshr_b32 s60, s20, 16 +; VI-NEXT: s_lshr_b32 s61, s21, 16 +; VI-NEXT: s_lshr_b32 s62, s22, 16 +; VI-NEXT: s_lshr_b32 s63, s23, 16 +; VI-NEXT: s_lshr_b32 s72, s24, 16 +; VI-NEXT: s_lshr_b32 s73, s25, 16 +; VI-NEXT: s_lshr_b32 s74, s26, 16 +; VI-NEXT: s_lshr_b32 s75, s40, 16 ; VI-NEXT: .LBB41_3: ; %end -; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_and_b32 s4, 0xffff, s40 ; VI-NEXT: s_lshl_b32 s5, s75, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s16, s74, 16 -; VI-NEXT: s_or_b32 s5, s5, s16 -; VI-NEXT: s_and_b32 s16, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s17, s73, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_and_b32 s17, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s18, s72, 16 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s18, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s19, s63, 16 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_and_b32 s19, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s20, s62, 16 -; VI-NEXT: s_or_b32 s19, s19, s20 -; VI-NEXT: s_and_b32 s20, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s21, s61, 16 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_and_b32 s21, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s22, s60, 16 -; VI-NEXT: s_or_b32 s21, s21, s22 -; VI-NEXT: s_and_b32 s22, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s23, s59, 16 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_and_b32 s23, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s24, s58, 16 -; VI-NEXT: s_or_b32 s23, s23, s24 -; VI-NEXT: s_and_b32 s24, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s25, s57, 16 -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_and_b32 s25, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s26, s56, 16 +; VI-NEXT: s_and_b32 s5, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s26, s74, 16 +; VI-NEXT: s_or_b32 s5, s5, s26 +; VI-NEXT: s_and_b32 s25, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s26, s73, 16 ; VI-NEXT: s_or_b32 s25, s25, s26 -; VI-NEXT: s_and_b32 s26, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s27, s47, 16 -; VI-NEXT: s_or_b32 s26, s26, s27 -; VI-NEXT: s_and_b32 s27, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s28, s46, 16 -; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s24, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s26, s72, 16 +; VI-NEXT: s_or_b32 s24, s24, s26 +; VI-NEXT: s_and_b32 s23, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s26, s63, 16 +; VI-NEXT: s_or_b32 s23, s23, s26 +; VI-NEXT: s_and_b32 s22, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s26, s62, 16 +; VI-NEXT: s_or_b32 s22, s22, s26 +; VI-NEXT: s_and_b32 s21, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s26, s61, 16 +; VI-NEXT: s_or_b32 s21, s21, s26 +; VI-NEXT: s_and_b32 s20, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s26, s60, 16 +; VI-NEXT: s_or_b32 s20, s20, s26 +; VI-NEXT: s_and_b32 s19, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s26, s59, 16 +; VI-NEXT: s_or_b32 s19, s19, s26 +; VI-NEXT: s_and_b32 s18, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s26, s58, 16 +; VI-NEXT: s_or_b32 s18, s18, s26 +; VI-NEXT: s_and_b32 s17, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s26, s57, 16 +; VI-NEXT: s_or_b32 s17, s17, s26 +; VI-NEXT: s_and_b32 s16, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s26, s56, 16 +; VI-NEXT: s_or_b32 s16, s16, s26 +; VI-NEXT: s_and_b32 s15, 0xffff, s15 +; VI-NEXT: s_lshl_b32 s26, s47, 16 +; VI-NEXT: s_or_b32 s15, s15, s26 +; VI-NEXT: s_and_b32 s14, 0xffff, s14 +; VI-NEXT: s_lshl_b32 s26, s46, 16 +; VI-NEXT: s_or_b32 s14, s14, s26 ; VI-NEXT: s_and_b32 s13, 0xffff, s13 -; VI-NEXT: s_lshl_b32 s28, s45, 16 -; VI-NEXT: s_or_b32 s13, s13, s28 +; VI-NEXT: s_lshl_b32 s26, s45, 16 +; VI-NEXT: s_or_b32 s13, s13, s26 ; VI-NEXT: s_and_b32 s12, 0xffff, s12 -; VI-NEXT: s_lshl_b32 s28, s44, 16 -; VI-NEXT: s_or_b32 s12, s12, s28 +; VI-NEXT: s_lshl_b32 s26, s44, 16 +; VI-NEXT: s_or_b32 s12, s12, s26 ; VI-NEXT: s_and_b32 s11, 0xffff, s11 -; VI-NEXT: s_lshl_b32 s28, s43, 16 -; VI-NEXT: s_or_b32 s11, s11, s28 +; VI-NEXT: s_lshl_b32 s26, s43, 16 +; VI-NEXT: s_or_b32 s11, s11, s26 ; VI-NEXT: s_and_b32 s10, 0xffff, s10 -; VI-NEXT: s_lshl_b32 s28, s42, 16 -; VI-NEXT: s_or_b32 s10, s10, s28 +; VI-NEXT: s_lshl_b32 s26, s42, 16 +; VI-NEXT: s_or_b32 s10, s10, s26 ; VI-NEXT: s_and_b32 s9, 0xffff, s9 -; VI-NEXT: s_lshl_b32 s28, s41, 16 -; VI-NEXT: s_or_b32 s9, s9, s28 +; VI-NEXT: s_lshl_b32 s26, s41, 16 +; VI-NEXT: s_or_b32 s9, s9, s26 ; VI-NEXT: s_and_b32 s8, 0xffff, s8 -; VI-NEXT: s_lshl_b32 s28, s40, 16 +; VI-NEXT: s_lshl_b32 s26, s29, 16 +; VI-NEXT: s_or_b32 s8, s8, s26 ; VI-NEXT: s_and_b32 s6, 0xffff, s6 -; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_lshl_b32 s26, s28, 16 +; VI-NEXT: s_or_b32 s6, s6, s26 ; VI-NEXT: s_and_b32 s7, 0xffff, s7 -; VI-NEXT: s_lshl_b32 s14, s14, 16 -; VI-NEXT: s_or_b32 s8, s8, s28 -; VI-NEXT: s_or_b32 s6, s6, s15 -; VI-NEXT: s_or_b32 s7, s7, s14 +; VI-NEXT: s_lshl_b32 s26, s27, 16 +; VI-NEXT: s_or_b32 s7, s7, s26 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: v_mov_b32_e32 v3, s17 -; VI-NEXT: v_mov_b32_e32 v4, s18 -; VI-NEXT: v_mov_b32_e32 v5, s19 -; VI-NEXT: v_mov_b32_e32 v6, s20 -; VI-NEXT: v_mov_b32_e32 v7, s21 -; VI-NEXT: v_mov_b32_e32 v8, s22 -; VI-NEXT: v_mov_b32_e32 v9, s23 -; VI-NEXT: v_mov_b32_e32 v10, s24 -; VI-NEXT: v_mov_b32_e32 v11, s25 -; VI-NEXT: v_mov_b32_e32 v12, s26 -; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v2, s25 +; VI-NEXT: v_mov_b32_e32 v3, s24 +; VI-NEXT: v_mov_b32_e32 v4, s23 +; VI-NEXT: v_mov_b32_e32 v5, s22 +; VI-NEXT: v_mov_b32_e32 v6, s21 +; VI-NEXT: v_mov_b32_e32 v7, s20 +; VI-NEXT: v_mov_b32_e32 v8, s19 +; VI-NEXT: v_mov_b32_e32 v9, s18 +; VI-NEXT: v_mov_b32_e32 v10, s17 +; VI-NEXT: v_mov_b32_e32 v11, s16 +; VI-NEXT: v_mov_b32_e32 v12, s15 +; VI-NEXT: v_mov_b32_e32 v13, s14 ; VI-NEXT: v_mov_b32_e32 v14, s13 ; VI-NEXT: v_mov_b32_e32 v15, s12 ; VI-NEXT: v_mov_b32_e32 v16, s11 @@ -20681,60 +21008,78 @@ define inreg <44 x i16> @bitcast_v11i64_to_v44i16_scalar(<11 x i64> inreg %a, i3 ; VI-NEXT: ; implicit-def: $sgpr43 ; VI-NEXT: ; implicit-def: $sgpr42 ; VI-NEXT: ; implicit-def: $sgpr41 -; VI-NEXT: ; implicit-def: $sgpr40 -; VI-NEXT: ; implicit-def: $sgpr15 -; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 ; VI-NEXT: s_branch .LBB41_2 ; ; GFX9-LABEL: bitcast_v11i64_to_v44i16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, s16 +; GFX9-NEXT: v_mov_b32_e32 v10, s17 +; GFX9-NEXT: v_mov_b32_e32 v11, s18 +; GFX9-NEXT: v_mov_b32_e32 v12, s19 +; GFX9-NEXT: v_mov_b32_e32 v13, s20 +; GFX9-NEXT: v_mov_b32_e32 v14, s21 +; GFX9-NEXT: v_mov_b32_e32 v15, s22 +; GFX9-NEXT: v_mov_b32_e32 v16, s23 +; GFX9-NEXT: v_mov_b32_e32 v17, s24 +; GFX9-NEXT: v_mov_b32_e32 v18, s25 +; GFX9-NEXT: v_mov_b32_e32 v19, s26 +; GFX9-NEXT: v_readfirstlane_b32 s6, v9 +; GFX9-NEXT: v_mov_b32_e32 v9, s27 +; GFX9-NEXT: v_readfirstlane_b32 s7, v10 +; GFX9-NEXT: v_mov_b32_e32 v10, s28 +; GFX9-NEXT: v_readfirstlane_b32 s8, v11 +; GFX9-NEXT: v_mov_b32_e32 v11, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: v_readfirstlane_b32 s7, v1 -; GFX9-NEXT: v_readfirstlane_b32 s8, v2 -; GFX9-NEXT: v_readfirstlane_b32 s9, v3 -; GFX9-NEXT: v_readfirstlane_b32 s10, v4 -; GFX9-NEXT: v_readfirstlane_b32 s11, v5 -; GFX9-NEXT: v_readfirstlane_b32 s12, v6 +; GFX9-NEXT: v_readfirstlane_b32 s9, v12 +; GFX9-NEXT: v_readfirstlane_b32 s10, v13 +; GFX9-NEXT: v_readfirstlane_b32 s11, v14 +; GFX9-NEXT: v_readfirstlane_b32 s12, v15 +; GFX9-NEXT: v_readfirstlane_b32 s13, v16 +; GFX9-NEXT: v_readfirstlane_b32 s14, v17 +; GFX9-NEXT: v_readfirstlane_b32 s15, v18 +; GFX9-NEXT: v_readfirstlane_b32 s16, v19 +; GFX9-NEXT: v_readfirstlane_b32 s17, v9 +; GFX9-NEXT: v_readfirstlane_b32 s18, v10 +; GFX9-NEXT: v_readfirstlane_b32 s19, v11 +; GFX9-NEXT: v_readfirstlane_b32 s20, v0 +; GFX9-NEXT: v_readfirstlane_b32 s21, v1 +; GFX9-NEXT: v_readfirstlane_b32 s22, v2 +; GFX9-NEXT: v_readfirstlane_b32 s23, v3 +; GFX9-NEXT: v_readfirstlane_b32 s24, v4 +; GFX9-NEXT: v_readfirstlane_b32 s25, v5 +; GFX9-NEXT: v_readfirstlane_b32 s26, v6 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_readfirstlane_b32 s13, v7 +; GFX9-NEXT: v_readfirstlane_b32 s27, v7 ; GFX9-NEXT: s_cbranch_scc0 .LBB41_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_lshr_b32 s14, s13, 16 -; GFX9-NEXT: s_lshr_b32 s15, s12, 16 -; GFX9-NEXT: s_lshr_b32 s40, s11, 16 -; GFX9-NEXT: s_lshr_b32 s41, s10, 16 -; GFX9-NEXT: s_lshr_b32 s42, s9, 16 -; GFX9-NEXT: s_lshr_b32 s43, s8, 16 -; GFX9-NEXT: s_lshr_b32 s44, s7, 16 -; GFX9-NEXT: s_lshr_b32 s45, s6, 16 -; GFX9-NEXT: s_lshr_b32 s46, s29, 16 -; GFX9-NEXT: s_lshr_b32 s47, s28, 16 -; GFX9-NEXT: s_lshr_b32 s56, s27, 16 -; GFX9-NEXT: s_lshr_b32 s57, s26, 16 -; GFX9-NEXT: s_lshr_b32 s58, s25, 16 -; GFX9-NEXT: s_lshr_b32 s59, s24, 16 -; GFX9-NEXT: s_lshr_b32 s60, s23, 16 -; GFX9-NEXT: s_lshr_b32 s61, s22, 16 -; GFX9-NEXT: s_lshr_b32 s62, s21, 16 -; GFX9-NEXT: s_lshr_b32 s63, s20, 16 -; GFX9-NEXT: s_lshr_b32 s72, s19, 16 -; GFX9-NEXT: s_lshr_b32 s73, s18, 16 -; GFX9-NEXT: s_lshr_b32 s74, s17, 16 -; GFX9-NEXT: s_lshr_b32 s75, s16, 16 +; GFX9-NEXT: s_lshr_b32 s28, s27, 16 +; GFX9-NEXT: s_lshr_b32 s29, s26, 16 +; GFX9-NEXT: s_lshr_b32 s40, s25, 16 +; GFX9-NEXT: s_lshr_b32 s41, s24, 16 +; GFX9-NEXT: s_lshr_b32 s42, s23, 16 +; GFX9-NEXT: s_lshr_b32 s43, s22, 16 +; GFX9-NEXT: s_lshr_b32 s44, s21, 16 +; GFX9-NEXT: s_lshr_b32 s45, s20, 16 +; GFX9-NEXT: s_lshr_b32 s46, s19, 16 +; GFX9-NEXT: s_lshr_b32 s47, s18, 16 +; GFX9-NEXT: s_lshr_b32 s56, s17, 16 +; GFX9-NEXT: s_lshr_b32 s57, s16, 16 +; GFX9-NEXT: s_lshr_b32 s58, s15, 16 +; GFX9-NEXT: s_lshr_b32 s59, s14, 16 +; GFX9-NEXT: s_lshr_b32 s60, s13, 16 +; GFX9-NEXT: s_lshr_b32 s61, s12, 16 +; GFX9-NEXT: s_lshr_b32 s62, s11, 16 +; GFX9-NEXT: s_lshr_b32 s63, s10, 16 +; GFX9-NEXT: s_lshr_b32 s72, s9, 16 +; GFX9-NEXT: s_lshr_b32 s73, s8, 16 +; GFX9-NEXT: s_lshr_b32 s74, s7, 16 +; GFX9-NEXT: s_lshr_b32 s75, s6, 16 ; GFX9-NEXT: s_cbranch_execnz .LBB41_3 ; GFX9-NEXT: .LBB41_2: ; %cmp.true -; GFX9-NEXT: s_add_u32 s12, s12, 3 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-NEXT: s_add_u32 s10, s10, 3 -; GFX9-NEXT: s_addc_u32 s11, s11, 0 -; GFX9-NEXT: s_add_u32 s8, s8, 3 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-NEXT: s_add_u32 s6, s6, 3 -; GFX9-NEXT: s_addc_u32 s7, s7, 0 -; GFX9-NEXT: s_add_u32 s28, s28, 3 -; GFX9-NEXT: s_addc_u32 s29, s29, 0 ; GFX9-NEXT: s_add_u32 s26, s26, 3 ; GFX9-NEXT: s_addc_u32 s27, s27, 0 ; GFX9-NEXT: s_add_u32 s24, s24, 3 @@ -20747,73 +21092,83 @@ define inreg <44 x i16> @bitcast_v11i64_to_v44i16_scalar(<11 x i64> inreg %a, i3 ; GFX9-NEXT: s_addc_u32 s19, s19, 0 ; GFX9-NEXT: s_add_u32 s16, s16, 3 ; GFX9-NEXT: s_addc_u32 s17, s17, 0 -; GFX9-NEXT: s_lshr_b32 s14, s13, 16 -; GFX9-NEXT: s_lshr_b32 s15, s12, 16 -; GFX9-NEXT: s_lshr_b32 s40, s11, 16 -; GFX9-NEXT: s_lshr_b32 s41, s10, 16 -; GFX9-NEXT: s_lshr_b32 s42, s9, 16 -; GFX9-NEXT: s_lshr_b32 s43, s8, 16 -; GFX9-NEXT: s_lshr_b32 s44, s7, 16 -; GFX9-NEXT: s_lshr_b32 s45, s6, 16 -; GFX9-NEXT: s_lshr_b32 s46, s29, 16 -; GFX9-NEXT: s_lshr_b32 s47, s28, 16 -; GFX9-NEXT: s_lshr_b32 s56, s27, 16 -; GFX9-NEXT: s_lshr_b32 s57, s26, 16 -; GFX9-NEXT: s_lshr_b32 s58, s25, 16 -; GFX9-NEXT: s_lshr_b32 s59, s24, 16 -; GFX9-NEXT: s_lshr_b32 s60, s23, 16 -; GFX9-NEXT: s_lshr_b32 s61, s22, 16 -; GFX9-NEXT: s_lshr_b32 s62, s21, 16 -; GFX9-NEXT: s_lshr_b32 s63, s20, 16 -; GFX9-NEXT: s_lshr_b32 s72, s19, 16 -; GFX9-NEXT: s_lshr_b32 s73, s18, 16 -; GFX9-NEXT: s_lshr_b32 s74, s17, 16 -; GFX9-NEXT: s_lshr_b32 s75, s16, 16 +; GFX9-NEXT: s_add_u32 s14, s14, 3 +; GFX9-NEXT: s_addc_u32 s15, s15, 0 +; GFX9-NEXT: s_add_u32 s12, s12, 3 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_add_u32 s10, s10, 3 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_lshr_b32 s28, s27, 16 +; GFX9-NEXT: s_lshr_b32 s29, s26, 16 +; GFX9-NEXT: s_lshr_b32 s40, s25, 16 +; GFX9-NEXT: s_lshr_b32 s41, s24, 16 +; GFX9-NEXT: s_lshr_b32 s42, s23, 16 +; GFX9-NEXT: s_lshr_b32 s43, s22, 16 +; GFX9-NEXT: s_lshr_b32 s44, s21, 16 +; GFX9-NEXT: s_lshr_b32 s45, s20, 16 +; GFX9-NEXT: s_lshr_b32 s46, s19, 16 +; GFX9-NEXT: s_lshr_b32 s47, s18, 16 +; GFX9-NEXT: s_lshr_b32 s56, s17, 16 +; GFX9-NEXT: s_lshr_b32 s57, s16, 16 +; GFX9-NEXT: s_lshr_b32 s58, s15, 16 +; GFX9-NEXT: s_lshr_b32 s59, s14, 16 +; GFX9-NEXT: s_lshr_b32 s60, s13, 16 +; GFX9-NEXT: s_lshr_b32 s61, s12, 16 +; GFX9-NEXT: s_lshr_b32 s62, s11, 16 +; GFX9-NEXT: s_lshr_b32 s63, s10, 16 +; GFX9-NEXT: s_lshr_b32 s72, s9, 16 +; GFX9-NEXT: s_lshr_b32 s73, s8, 16 +; GFX9-NEXT: s_lshr_b32 s74, s7, 16 +; GFX9-NEXT: s_lshr_b32 s75, s6, 16 ; GFX9-NEXT: .LBB41_3: ; %end -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s75 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s74 -; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s73 -; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s72 -; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s63 -; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s62 -; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s61 -; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s60 -; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s59 -; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s58 -; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s57 -; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s56 -; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s47 -; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s46 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s45 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s44 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s43 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s42 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s41 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s40 -; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s15 -; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s7, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s9, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s10, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s11, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s12, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s13, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s14, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s15, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s16, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s17, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s40 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s29 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s28 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s18 -; GFX9-NEXT: v_mov_b32_e32 v5, s19 -; GFX9-NEXT: v_mov_b32_e32 v6, s20 -; GFX9-NEXT: v_mov_b32_e32 v7, s21 -; GFX9-NEXT: v_mov_b32_e32 v8, s22 -; GFX9-NEXT: v_mov_b32_e32 v9, s23 -; GFX9-NEXT: v_mov_b32_e32 v10, s24 -; GFX9-NEXT: v_mov_b32_e32 v11, s25 -; GFX9-NEXT: v_mov_b32_e32 v12, s26 -; GFX9-NEXT: v_mov_b32_e32 v13, s27 -; GFX9-NEXT: v_mov_b32_e32 v14, s6 -; GFX9-NEXT: v_mov_b32_e32 v15, s7 -; GFX9-NEXT: v_mov_b32_e32 v16, s8 -; GFX9-NEXT: v_mov_b32_e32 v17, s9 -; GFX9-NEXT: v_mov_b32_e32 v18, s10 -; GFX9-NEXT: v_mov_b32_e32 v19, s11 -; GFX9-NEXT: v_mov_b32_e32 v20, s12 -; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-NEXT: v_mov_b32_e32 v12, s16 +; GFX9-NEXT: v_mov_b32_e32 v13, s17 +; GFX9-NEXT: v_mov_b32_e32 v14, s18 +; GFX9-NEXT: v_mov_b32_e32 v15, s19 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v18, s22 +; GFX9-NEXT: v_mov_b32_e32 v19, s23 +; GFX9-NEXT: v_mov_b32_e32 v20, s24 +; GFX9-NEXT: v_mov_b32_e32 v21, s25 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB41_4: ; GFX9-NEXT: ; implicit-def: $sgpr75 @@ -20836,40 +21191,67 @@ define inreg <44 x i16> @bitcast_v11i64_to_v44i16_scalar(<11 x i64> inreg %a, i3 ; GFX9-NEXT: ; implicit-def: $sgpr42 ; GFX9-NEXT: ; implicit-def: $sgpr41 ; GFX9-NEXT: ; implicit-def: $sgpr40 -; GFX9-NEXT: ; implicit-def: $sgpr15 -; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr29 +; GFX9-NEXT: ; implicit-def: $sgpr28 ; GFX9-NEXT: s_branch .LBB41_2 ; ; GFX11-LABEL: bitcast_v11i64_to_v44i16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-NEXT: v_dual_mov_b32 v9, s16 :: v_dual_mov_b32 v10, s17 +; GFX11-NEXT: v_dual_mov_b32 v11, s18 :: v_dual_mov_b32 v12, s19 +; GFX11-NEXT: v_dual_mov_b32 v13, s20 :: v_dual_mov_b32 v14, s21 +; GFX11-NEXT: v_dual_mov_b32 v15, s22 :: v_dual_mov_b32 v16, s23 +; GFX11-NEXT: v_dual_mov_b32 v17, s24 :: v_dual_mov_b32 v18, s25 +; GFX11-NEXT: v_dual_mov_b32 v19, s26 :: v_dual_mov_b32 v20, s27 +; GFX11-NEXT: v_dual_mov_b32 v21, s28 :: v_dual_mov_b32 v22, s29 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s7, v2 -; GFX11-NEXT: v_readfirstlane_b32 s6, v3 +; GFX11-NEXT: v_readfirstlane_b32 s0, v5 +; GFX11-NEXT: v_readfirstlane_b32 s1, v6 +; GFX11-NEXT: v_readfirstlane_b32 s2, v7 +; GFX11-NEXT: v_readfirstlane_b32 s3, v8 +; GFX11-NEXT: v_readfirstlane_b32 s4, v9 +; GFX11-NEXT: v_readfirstlane_b32 s5, v10 +; GFX11-NEXT: v_readfirstlane_b32 s6, v11 +; GFX11-NEXT: v_readfirstlane_b32 s7, v12 +; GFX11-NEXT: v_readfirstlane_b32 s8, v13 +; GFX11-NEXT: v_readfirstlane_b32 s9, v14 +; GFX11-NEXT: v_readfirstlane_b32 s10, v15 +; GFX11-NEXT: v_readfirstlane_b32 s11, v16 +; GFX11-NEXT: v_readfirstlane_b32 s12, v17 +; GFX11-NEXT: v_readfirstlane_b32 s13, v18 +; GFX11-NEXT: v_readfirstlane_b32 s14, v19 +; GFX11-NEXT: v_readfirstlane_b32 s15, v20 +; GFX11-NEXT: v_readfirstlane_b32 s16, v21 +; GFX11-NEXT: v_readfirstlane_b32 s17, v22 +; GFX11-NEXT: v_readfirstlane_b32 s18, v0 +; GFX11-NEXT: v_readfirstlane_b32 s19, v1 +; GFX11-NEXT: v_readfirstlane_b32 s21, v2 +; GFX11-NEXT: v_readfirstlane_b32 s20, v3 ; GFX11-NEXT: s_mov_b32 s62, 0 -; GFX11-NEXT: s_and_b32 s8, vcc_lo, exec_lo +; GFX11-NEXT: s_and_b32 s22, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB41_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: s_lshr_b32 s8, s6, 16 -; GFX11-NEXT: s_lshr_b32 s9, s7, 16 -; GFX11-NEXT: s_lshr_b32 s10, s5, 16 -; GFX11-NEXT: s_lshr_b32 s11, s4, 16 -; GFX11-NEXT: s_lshr_b32 s12, s29, 16 -; GFX11-NEXT: s_lshr_b32 s13, s28, 16 -; GFX11-NEXT: s_lshr_b32 s14, s27, 16 -; GFX11-NEXT: s_lshr_b32 s15, s26, 16 -; GFX11-NEXT: s_lshr_b32 s40, s25, 16 -; GFX11-NEXT: s_lshr_b32 s41, s24, 16 -; GFX11-NEXT: s_lshr_b32 s42, s23, 16 -; GFX11-NEXT: s_lshr_b32 s43, s22, 16 -; GFX11-NEXT: s_lshr_b32 s44, s21, 16 -; GFX11-NEXT: s_lshr_b32 s45, s20, 16 -; GFX11-NEXT: s_lshr_b32 s46, s19, 16 -; GFX11-NEXT: s_lshr_b32 s47, s18, 16 -; GFX11-NEXT: s_lshr_b32 s56, s17, 16 -; GFX11-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-NEXT: s_lshr_b32 s22, s20, 16 +; GFX11-NEXT: s_lshr_b32 s23, s21, 16 +; GFX11-NEXT: s_lshr_b32 s24, s19, 16 +; GFX11-NEXT: s_lshr_b32 s25, s18, 16 +; GFX11-NEXT: s_lshr_b32 s26, s17, 16 +; GFX11-NEXT: s_lshr_b32 s27, s16, 16 +; GFX11-NEXT: s_lshr_b32 s28, s15, 16 +; GFX11-NEXT: s_lshr_b32 s29, s14, 16 +; GFX11-NEXT: s_lshr_b32 s40, s13, 16 +; GFX11-NEXT: s_lshr_b32 s41, s12, 16 +; GFX11-NEXT: s_lshr_b32 s42, s11, 16 +; GFX11-NEXT: s_lshr_b32 s43, s10, 16 +; GFX11-NEXT: s_lshr_b32 s44, s9, 16 +; GFX11-NEXT: s_lshr_b32 s45, s8, 16 +; GFX11-NEXT: s_lshr_b32 s46, s7, 16 +; GFX11-NEXT: s_lshr_b32 s47, s6, 16 +; GFX11-NEXT: s_lshr_b32 s56, s5, 16 +; GFX11-NEXT: s_lshr_b32 s57, s4, 16 ; GFX11-NEXT: s_lshr_b32 s58, s3, 16 ; GFX11-NEXT: s_lshr_b32 s59, s2, 16 ; GFX11-NEXT: s_lshr_b32 s60, s1, 16 @@ -20877,46 +21259,46 @@ define inreg <44 x i16> @bitcast_v11i64_to_v44i16_scalar(<11 x i64> inreg %a, i3 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s62 ; GFX11-NEXT: s_cbranch_vccnz .LBB41_3 ; GFX11-NEXT: .LBB41_2: ; %cmp.true -; GFX11-NEXT: s_add_u32 s7, s7, 3 -; GFX11-NEXT: s_addc_u32 s6, s6, 0 -; GFX11-NEXT: s_add_u32 s4, s4, 3 -; GFX11-NEXT: s_addc_u32 s5, s5, 0 -; GFX11-NEXT: s_add_u32 s28, s28, 3 -; GFX11-NEXT: s_addc_u32 s29, s29, 0 -; GFX11-NEXT: s_add_u32 s26, s26, 3 -; GFX11-NEXT: s_addc_u32 s27, s27, 0 -; GFX11-NEXT: s_add_u32 s24, s24, 3 -; GFX11-NEXT: s_addc_u32 s25, s25, 0 -; GFX11-NEXT: s_add_u32 s22, s22, 3 -; GFX11-NEXT: s_addc_u32 s23, s23, 0 -; GFX11-NEXT: s_add_u32 s20, s20, 3 -; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s21, s21, 3 +; GFX11-NEXT: s_addc_u32 s20, s20, 0 ; GFX11-NEXT: s_add_u32 s18, s18, 3 ; GFX11-NEXT: s_addc_u32 s19, s19, 0 ; GFX11-NEXT: s_add_u32 s16, s16, 3 ; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s14, s14, 3 +; GFX11-NEXT: s_addc_u32 s15, s15, 0 +; GFX11-NEXT: s_add_u32 s12, s12, 3 +; GFX11-NEXT: s_addc_u32 s13, s13, 0 +; GFX11-NEXT: s_add_u32 s10, s10, 3 +; GFX11-NEXT: s_addc_u32 s11, s11, 0 +; GFX11-NEXT: s_add_u32 s8, s8, 3 +; GFX11-NEXT: s_addc_u32 s9, s9, 0 +; GFX11-NEXT: s_add_u32 s6, s6, 3 +; GFX11-NEXT: s_addc_u32 s7, s7, 0 +; GFX11-NEXT: s_add_u32 s4, s4, 3 +; GFX11-NEXT: s_addc_u32 s5, s5, 0 ; GFX11-NEXT: s_add_u32 s2, s2, 3 ; GFX11-NEXT: s_addc_u32 s3, s3, 0 ; GFX11-NEXT: s_add_u32 s0, s0, 3 ; GFX11-NEXT: s_addc_u32 s1, s1, 0 -; GFX11-NEXT: s_lshr_b32 s8, s6, 16 -; GFX11-NEXT: s_lshr_b32 s9, s7, 16 -; GFX11-NEXT: s_lshr_b32 s10, s5, 16 -; GFX11-NEXT: s_lshr_b32 s11, s4, 16 -; GFX11-NEXT: s_lshr_b32 s12, s29, 16 -; GFX11-NEXT: s_lshr_b32 s13, s28, 16 -; GFX11-NEXT: s_lshr_b32 s14, s27, 16 -; GFX11-NEXT: s_lshr_b32 s15, s26, 16 -; GFX11-NEXT: s_lshr_b32 s40, s25, 16 -; GFX11-NEXT: s_lshr_b32 s41, s24, 16 -; GFX11-NEXT: s_lshr_b32 s42, s23, 16 -; GFX11-NEXT: s_lshr_b32 s43, s22, 16 -; GFX11-NEXT: s_lshr_b32 s44, s21, 16 -; GFX11-NEXT: s_lshr_b32 s45, s20, 16 -; GFX11-NEXT: s_lshr_b32 s46, s19, 16 -; GFX11-NEXT: s_lshr_b32 s47, s18, 16 -; GFX11-NEXT: s_lshr_b32 s56, s17, 16 -; GFX11-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-NEXT: s_lshr_b32 s22, s20, 16 +; GFX11-NEXT: s_lshr_b32 s23, s21, 16 +; GFX11-NEXT: s_lshr_b32 s24, s19, 16 +; GFX11-NEXT: s_lshr_b32 s25, s18, 16 +; GFX11-NEXT: s_lshr_b32 s26, s17, 16 +; GFX11-NEXT: s_lshr_b32 s27, s16, 16 +; GFX11-NEXT: s_lshr_b32 s28, s15, 16 +; GFX11-NEXT: s_lshr_b32 s29, s14, 16 +; GFX11-NEXT: s_lshr_b32 s40, s13, 16 +; GFX11-NEXT: s_lshr_b32 s41, s12, 16 +; GFX11-NEXT: s_lshr_b32 s42, s11, 16 +; GFX11-NEXT: s_lshr_b32 s43, s10, 16 +; GFX11-NEXT: s_lshr_b32 s44, s9, 16 +; GFX11-NEXT: s_lshr_b32 s45, s8, 16 +; GFX11-NEXT: s_lshr_b32 s46, s7, 16 +; GFX11-NEXT: s_lshr_b32 s47, s6, 16 +; GFX11-NEXT: s_lshr_b32 s56, s5, 16 +; GFX11-NEXT: s_lshr_b32 s57, s4, 16 ; GFX11-NEXT: s_lshr_b32 s58, s3, 16 ; GFX11-NEXT: s_lshr_b32 s59, s2, 16 ; GFX11-NEXT: s_lshr_b32 s60, s1, 16 @@ -20927,35 +21309,35 @@ define inreg <44 x i16> @bitcast_v11i64_to_v44i16_scalar(<11 x i64> inreg %a, i3 ; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s60 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s59 ; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s58 -; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s57 -; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s56 -; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s47 -; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s46 -; GFX11-NEXT: s_pack_ll_b32_b16 s20, s20, s45 -; GFX11-NEXT: s_pack_ll_b32_b16 s21, s21, s44 -; GFX11-NEXT: s_pack_ll_b32_b16 s22, s22, s43 -; GFX11-NEXT: s_pack_ll_b32_b16 s23, s23, s42 -; GFX11-NEXT: s_pack_ll_b32_b16 s24, s24, s41 -; GFX11-NEXT: s_pack_ll_b32_b16 s25, s25, s40 -; GFX11-NEXT: s_pack_ll_b32_b16 s15, s26, s15 -; GFX11-NEXT: s_pack_ll_b32_b16 s14, s27, s14 -; GFX11-NEXT: s_pack_ll_b32_b16 s13, s28, s13 -; GFX11-NEXT: s_pack_ll_b32_b16 s12, s29, s12 -; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s11 -; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s10 -; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s9 -; GFX11-NEXT: s_pack_ll_b32_b16 s6, s6, s8 +; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s57 +; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s56 +; GFX11-NEXT: s_pack_ll_b32_b16 s6, s6, s47 +; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s46 +; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s45 +; GFX11-NEXT: s_pack_ll_b32_b16 s9, s9, s44 +; GFX11-NEXT: s_pack_ll_b32_b16 s10, s10, s43 +; GFX11-NEXT: s_pack_ll_b32_b16 s11, s11, s42 +; GFX11-NEXT: s_pack_ll_b32_b16 s12, s12, s41 +; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s40 +; GFX11-NEXT: s_pack_ll_b32_b16 s14, s14, s29 +; GFX11-NEXT: s_pack_ll_b32_b16 s15, s15, s28 +; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s27 +; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s26 +; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s25 +; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s24 +; GFX11-NEXT: s_pack_ll_b32_b16 s21, s21, s23 +; GFX11-NEXT: s_pack_ll_b32_b16 s20, s20, s22 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 -; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 -; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 -; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 -; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 -; GFX11-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s14 -; GFX11-NEXT: v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v17, s12 -; GFX11-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 -; GFX11-NEXT: v_dual_mov_b32 v20, s7 :: v_dual_mov_b32 v21, s6 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GFX11-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-NEXT: v_dual_mov_b32 v20, s21 :: v_dual_mov_b32 v21, s20 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB41_4: ; GFX11-NEXT: ; implicit-def: $sgpr61 @@ -20972,14 +21354,14 @@ define inreg <44 x i16> @bitcast_v11i64_to_v44i16_scalar(<11 x i64> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr41 ; GFX11-NEXT: ; implicit-def: $sgpr40 -; GFX11-NEXT: ; implicit-def: $sgpr15 -; GFX11-NEXT: ; implicit-def: $sgpr14 -; GFX11-NEXT: ; implicit-def: $sgpr13 -; GFX11-NEXT: ; implicit-def: $sgpr12 -; GFX11-NEXT: ; implicit-def: $sgpr11 -; GFX11-NEXT: ; implicit-def: $sgpr10 -; GFX11-NEXT: ; implicit-def: $sgpr9 -; GFX11-NEXT: ; implicit-def: $sgpr8 +; GFX11-NEXT: ; implicit-def: $sgpr29 +; GFX11-NEXT: ; implicit-def: $sgpr28 +; GFX11-NEXT: ; implicit-def: $sgpr27 +; GFX11-NEXT: ; implicit-def: $sgpr26 +; GFX11-NEXT: ; implicit-def: $sgpr25 +; GFX11-NEXT: ; implicit-def: $sgpr24 +; GFX11-NEXT: ; implicit-def: $sgpr23 +; GFX11-NEXT: ; implicit-def: $sgpr22 ; GFX11-NEXT: s_branch .LBB41_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -21338,7 +21720,7 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 ; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v21 ; SI-NEXT: .LBB42_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload @@ -23840,7 +24222,35 @@ define inreg <44 x half> @bitcast_v11i64_to_v44f16_scalar(<11 x i64> inreg %a, i ; SI-LABEL: bitcast_v11i64_to_v44f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v10, s16 +; SI-NEXT: v_mov_b32_e32 v11, s17 +; SI-NEXT: v_mov_b32_e32 v12, s18 +; SI-NEXT: v_mov_b32_e32 v13, s19 +; SI-NEXT: v_mov_b32_e32 v14, s20 +; SI-NEXT: v_mov_b32_e32 v15, s21 +; SI-NEXT: v_mov_b32_e32 v16, s22 +; SI-NEXT: v_mov_b32_e32 v17, s23 +; SI-NEXT: v_mov_b32_e32 v18, s24 +; SI-NEXT: v_mov_b32_e32 v19, s25 +; SI-NEXT: v_readfirstlane_b32 s22, v10 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_readfirstlane_b32 s26, v11 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_readfirstlane_b32 s23, v12 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_readfirstlane_b32 s27, v13 +; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; SI-NEXT: v_readfirstlane_b32 s24, v14 +; SI-NEXT: v_readfirstlane_b32 s25, v15 +; SI-NEXT: v_readfirstlane_b32 s20, v16 +; SI-NEXT: v_readfirstlane_b32 s21, v17 +; SI-NEXT: v_readfirstlane_b32 s18, v18 +; SI-NEXT: v_readfirstlane_b32 s19, v19 +; SI-NEXT: v_readfirstlane_b32 s16, v10 +; SI-NEXT: v_readfirstlane_b32 s17, v11 +; SI-NEXT: v_readfirstlane_b32 s14, v12 +; SI-NEXT: v_readfirstlane_b32 s15, v13 ; SI-NEXT: v_readfirstlane_b32 s12, v1 ; SI-NEXT: v_readfirstlane_b32 s13, v2 ; SI-NEXT: v_readfirstlane_b32 s10, v3 @@ -23868,33 +24278,33 @@ define inreg <44 x half> @bitcast_v11i64_to_v44f16_scalar(<11 x i64> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 ; SI-NEXT: s_lshr_b32 s4, s12, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s4, s15, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: s_lshr_b32 s4, s14, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: s_lshr_b32 s4, s17, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: s_lshr_b32 s4, s19, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: s_lshr_b32 s4, s18, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: s_lshr_b32 s4, s21, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: s_lshr_b32 s4, s20, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: s_lshr_b32 s4, s25, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: s_lshr_b32 s4, s24, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: s_lshr_b32 s4, s27, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: s_lshr_b32 s4, s23, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: s_lshr_b32 s4, s26, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_lshr_b32 s4, s22, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 @@ -23904,50 +24314,50 @@ define inreg <44 x half> @bitcast_v11i64_to_v44f16_scalar(<11 x i64> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v15, s13 ; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s22 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s16, 3 -; SI-NEXT: s_addc_u32 s5, s17, 0 -; SI-NEXT: s_lshr_b32 s14, s4, 16 -; SI-NEXT: s_lshr_b32 s15, s5, 16 -; SI-NEXT: s_add_u32 s16, s18, 3 -; SI-NEXT: s_addc_u32 s17, s19, 0 -; SI-NEXT: s_lshr_b32 s18, s16, 16 -; SI-NEXT: s_lshr_b32 s19, s17, 16 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_lshr_b32 s40, s20, 16 -; SI-NEXT: s_lshr_b32 s41, s21, 16 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_lshr_b32 s42, s22, 16 -; SI-NEXT: s_lshr_b32 s43, s23, 16 +; SI-NEXT: s_add_u32 s4, s22, 3 +; SI-NEXT: s_addc_u32 s5, s26, 0 +; SI-NEXT: s_lshr_b32 s22, s4, 16 +; SI-NEXT: s_lshr_b32 s26, s5, 16 +; SI-NEXT: s_add_u32 s23, s23, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_lshr_b32 s28, s23, 16 +; SI-NEXT: s_lshr_b32 s29, s27, 16 ; SI-NEXT: s_add_u32 s24, s24, 3 ; SI-NEXT: s_addc_u32 s25, s25, 0 -; SI-NEXT: s_lshr_b32 s44, s24, 16 -; SI-NEXT: s_lshr_b32 s45, s25, 16 -; SI-NEXT: s_add_u32 s26, s26, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 -; SI-NEXT: s_lshr_b32 s46, s26, 16 -; SI-NEXT: s_lshr_b32 s47, s27, 16 -; SI-NEXT: s_add_u32 s28, s28, 3 -; SI-NEXT: s_addc_u32 s29, s29, 0 -; SI-NEXT: s_lshr_b32 s56, s28, 16 -; SI-NEXT: s_lshr_b32 s57, s29, 16 +; SI-NEXT: s_lshr_b32 s40, s24, 16 +; SI-NEXT: s_lshr_b32 s41, s25, 16 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_lshr_b32 s42, s20, 16 +; SI-NEXT: s_lshr_b32 s43, s21, 16 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_lshr_b32 s44, s18, 16 +; SI-NEXT: s_lshr_b32 s45, s19, 16 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_lshr_b32 s46, s16, 16 +; SI-NEXT: s_lshr_b32 s47, s17, 16 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 +; SI-NEXT: s_lshr_b32 s56, s14, 16 +; SI-NEXT: s_lshr_b32 s57, s15, 16 ; SI-NEXT: s_add_u32 s12, s12, 3 ; SI-NEXT: s_addc_u32 s13, s13, 0 ; SI-NEXT: s_lshr_b32 s58, s12, 16 @@ -23972,18 +24382,18 @@ define inreg <44 x half> @bitcast_v11i64_to_v44f16_scalar(<11 x i64> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v15, s13 ; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s23 ; SI-NEXT: v_cvt_f32_f16_e32 v49, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s73 @@ -24004,10 +24414,10 @@ define inreg <44 x half> @bitcast_v11i64_to_v44f16_scalar(<11 x i64> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v31, s42 ; SI-NEXT: v_cvt_f32_f16_e32 v33, s41 ; SI-NEXT: v_cvt_f32_f16_e32 v35, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s22 ; SI-NEXT: .LBB45_3: ; %end ; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 @@ -24213,7 +24623,35 @@ define inreg <44 x half> @bitcast_v11i64_to_v44f16_scalar(<11 x i64> inreg %a, i ; VI-LABEL: bitcast_v11i64_to_v44f16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, s16 +; VI-NEXT: v_mov_b32_e32 v10, s17 +; VI-NEXT: v_mov_b32_e32 v11, s18 +; VI-NEXT: v_mov_b32_e32 v12, s19 +; VI-NEXT: v_mov_b32_e32 v13, s20 +; VI-NEXT: v_mov_b32_e32 v14, s21 +; VI-NEXT: v_mov_b32_e32 v15, s22 +; VI-NEXT: v_mov_b32_e32 v16, s23 +; VI-NEXT: v_mov_b32_e32 v17, s24 +; VI-NEXT: v_mov_b32_e32 v18, s25 +; VI-NEXT: v_mov_b32_e32 v19, s26 +; VI-NEXT: v_readfirstlane_b32 s40, v9 +; VI-NEXT: v_mov_b32_e32 v9, s27 +; VI-NEXT: v_readfirstlane_b32 s26, v10 +; VI-NEXT: v_mov_b32_e32 v10, s28 +; VI-NEXT: v_readfirstlane_b32 s25, v11 +; VI-NEXT: v_mov_b32_e32 v11, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_readfirstlane_b32 s24, v12 +; VI-NEXT: v_readfirstlane_b32 s23, v13 +; VI-NEXT: v_readfirstlane_b32 s22, v14 +; VI-NEXT: v_readfirstlane_b32 s21, v15 +; VI-NEXT: v_readfirstlane_b32 s20, v16 +; VI-NEXT: v_readfirstlane_b32 s19, v17 +; VI-NEXT: v_readfirstlane_b32 s18, v18 +; VI-NEXT: v_readfirstlane_b32 s17, v19 +; VI-NEXT: v_readfirstlane_b32 s16, v9 +; VI-NEXT: v_readfirstlane_b32 s15, v10 +; VI-NEXT: v_readfirstlane_b32 s14, v11 ; VI-NEXT: v_readfirstlane_b32 s13, v0 ; VI-NEXT: v_readfirstlane_b32 s12, v1 ; VI-NEXT: v_readfirstlane_b32 s11, v2 @@ -24225,28 +24663,28 @@ define inreg <44 x half> @bitcast_v11i64_to_v44f16_scalar(<11 x i64> inreg %a, i ; VI-NEXT: v_readfirstlane_b32 s7, v7 ; VI-NEXT: s_cbranch_scc0 .LBB45_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s14, s7, 16 -; VI-NEXT: s_lshr_b32 s15, s6, 16 -; VI-NEXT: s_lshr_b32 s40, s8, 16 +; VI-NEXT: s_lshr_b32 s27, s7, 16 +; VI-NEXT: s_lshr_b32 s28, s6, 16 +; VI-NEXT: s_lshr_b32 s29, s8, 16 ; VI-NEXT: s_lshr_b32 s41, s9, 16 ; VI-NEXT: s_lshr_b32 s42, s10, 16 ; VI-NEXT: s_lshr_b32 s43, s11, 16 ; VI-NEXT: s_lshr_b32 s44, s12, 16 ; VI-NEXT: s_lshr_b32 s45, s13, 16 -; VI-NEXT: s_lshr_b32 s46, s29, 16 -; VI-NEXT: s_lshr_b32 s47, s28, 16 -; VI-NEXT: s_lshr_b32 s56, s27, 16 -; VI-NEXT: s_lshr_b32 s57, s26, 16 -; VI-NEXT: s_lshr_b32 s58, s25, 16 -; VI-NEXT: s_lshr_b32 s59, s24, 16 -; VI-NEXT: s_lshr_b32 s60, s23, 16 -; VI-NEXT: s_lshr_b32 s61, s22, 16 -; VI-NEXT: s_lshr_b32 s62, s21, 16 -; VI-NEXT: s_lshr_b32 s63, s20, 16 -; VI-NEXT: s_lshr_b32 s72, s19, 16 -; VI-NEXT: s_lshr_b32 s73, s18, 16 -; VI-NEXT: s_lshr_b32 s74, s17, 16 -; VI-NEXT: s_lshr_b32 s75, s16, 16 +; VI-NEXT: s_lshr_b32 s46, s14, 16 +; VI-NEXT: s_lshr_b32 s47, s15, 16 +; VI-NEXT: s_lshr_b32 s56, s16, 16 +; VI-NEXT: s_lshr_b32 s57, s17, 16 +; VI-NEXT: s_lshr_b32 s58, s18, 16 +; VI-NEXT: s_lshr_b32 s59, s19, 16 +; VI-NEXT: s_lshr_b32 s60, s20, 16 +; VI-NEXT: s_lshr_b32 s61, s21, 16 +; VI-NEXT: s_lshr_b32 s62, s22, 16 +; VI-NEXT: s_lshr_b32 s63, s23, 16 +; VI-NEXT: s_lshr_b32 s72, s24, 16 +; VI-NEXT: s_lshr_b32 s73, s25, 16 +; VI-NEXT: s_lshr_b32 s74, s26, 16 +; VI-NEXT: s_lshr_b32 s75, s40, 16 ; VI-NEXT: s_cbranch_execnz .LBB45_3 ; VI-NEXT: .LBB45_2: ; %cmp.true ; VI-NEXT: s_add_u32 s6, s6, 3 @@ -24257,123 +24695,123 @@ define inreg <44 x half> @bitcast_v11i64_to_v44f16_scalar(<11 x i64> inreg %a, i ; VI-NEXT: s_addc_u32 s10, s10, 0 ; VI-NEXT: s_add_u32 s13, s13, 3 ; VI-NEXT: s_addc_u32 s12, s12, 0 -; VI-NEXT: s_add_u32 s28, s28, 3 -; VI-NEXT: s_addc_u32 s29, s29, 0 -; VI-NEXT: s_add_u32 s26, s26, 3 -; VI-NEXT: s_addc_u32 s27, s27, 0 -; VI-NEXT: s_add_u32 s24, s24, 3 -; VI-NEXT: s_addc_u32 s25, s25, 0 -; VI-NEXT: s_add_u32 s22, s22, 3 -; VI-NEXT: s_addc_u32 s23, s23, 0 -; VI-NEXT: s_add_u32 s20, s20, 3 -; VI-NEXT: s_addc_u32 s21, s21, 0 -; VI-NEXT: s_add_u32 s18, s18, 3 -; VI-NEXT: s_addc_u32 s19, s19, 0 -; VI-NEXT: s_add_u32 s16, s16, 3 -; VI-NEXT: s_addc_u32 s17, s17, 0 -; VI-NEXT: s_lshr_b32 s14, s7, 16 -; VI-NEXT: s_lshr_b32 s15, s6, 16 -; VI-NEXT: s_lshr_b32 s40, s8, 16 +; VI-NEXT: s_add_u32 s15, s15, 3 +; VI-NEXT: s_addc_u32 s14, s14, 0 +; VI-NEXT: s_add_u32 s17, s17, 3 +; VI-NEXT: s_addc_u32 s16, s16, 0 +; VI-NEXT: s_add_u32 s19, s19, 3 +; VI-NEXT: s_addc_u32 s18, s18, 0 +; VI-NEXT: s_add_u32 s21, s21, 3 +; VI-NEXT: s_addc_u32 s20, s20, 0 +; VI-NEXT: s_add_u32 s23, s23, 3 +; VI-NEXT: s_addc_u32 s22, s22, 0 +; VI-NEXT: s_add_u32 s25, s25, 3 +; VI-NEXT: s_addc_u32 s24, s24, 0 +; VI-NEXT: s_add_u32 s40, s40, 3 +; VI-NEXT: s_addc_u32 s26, s26, 0 +; VI-NEXT: s_lshr_b32 s27, s7, 16 +; VI-NEXT: s_lshr_b32 s28, s6, 16 +; VI-NEXT: s_lshr_b32 s29, s8, 16 ; VI-NEXT: s_lshr_b32 s41, s9, 16 ; VI-NEXT: s_lshr_b32 s42, s10, 16 ; VI-NEXT: s_lshr_b32 s43, s11, 16 ; VI-NEXT: s_lshr_b32 s44, s12, 16 ; VI-NEXT: s_lshr_b32 s45, s13, 16 -; VI-NEXT: s_lshr_b32 s46, s29, 16 -; VI-NEXT: s_lshr_b32 s47, s28, 16 -; VI-NEXT: s_lshr_b32 s56, s27, 16 -; VI-NEXT: s_lshr_b32 s57, s26, 16 -; VI-NEXT: s_lshr_b32 s58, s25, 16 -; VI-NEXT: s_lshr_b32 s59, s24, 16 -; VI-NEXT: s_lshr_b32 s60, s23, 16 -; VI-NEXT: s_lshr_b32 s61, s22, 16 -; VI-NEXT: s_lshr_b32 s62, s21, 16 -; VI-NEXT: s_lshr_b32 s63, s20, 16 -; VI-NEXT: s_lshr_b32 s72, s19, 16 -; VI-NEXT: s_lshr_b32 s73, s18, 16 -; VI-NEXT: s_lshr_b32 s74, s17, 16 -; VI-NEXT: s_lshr_b32 s75, s16, 16 +; VI-NEXT: s_lshr_b32 s46, s14, 16 +; VI-NEXT: s_lshr_b32 s47, s15, 16 +; VI-NEXT: s_lshr_b32 s56, s16, 16 +; VI-NEXT: s_lshr_b32 s57, s17, 16 +; VI-NEXT: s_lshr_b32 s58, s18, 16 +; VI-NEXT: s_lshr_b32 s59, s19, 16 +; VI-NEXT: s_lshr_b32 s60, s20, 16 +; VI-NEXT: s_lshr_b32 s61, s21, 16 +; VI-NEXT: s_lshr_b32 s62, s22, 16 +; VI-NEXT: s_lshr_b32 s63, s23, 16 +; VI-NEXT: s_lshr_b32 s72, s24, 16 +; VI-NEXT: s_lshr_b32 s73, s25, 16 +; VI-NEXT: s_lshr_b32 s74, s26, 16 +; VI-NEXT: s_lshr_b32 s75, s40, 16 ; VI-NEXT: .LBB45_3: ; %end -; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_and_b32 s4, 0xffff, s40 ; VI-NEXT: s_lshl_b32 s5, s75, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s16, s74, 16 -; VI-NEXT: s_or_b32 s5, s5, s16 -; VI-NEXT: s_and_b32 s16, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s17, s73, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_and_b32 s17, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s18, s72, 16 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s18, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s19, s63, 16 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_and_b32 s19, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s20, s62, 16 -; VI-NEXT: s_or_b32 s19, s19, s20 -; VI-NEXT: s_and_b32 s20, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s21, s61, 16 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_and_b32 s21, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s22, s60, 16 -; VI-NEXT: s_or_b32 s21, s21, s22 -; VI-NEXT: s_and_b32 s22, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s23, s59, 16 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_and_b32 s23, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s24, s58, 16 -; VI-NEXT: s_or_b32 s23, s23, s24 -; VI-NEXT: s_and_b32 s24, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s25, s57, 16 -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_and_b32 s25, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s26, s56, 16 +; VI-NEXT: s_and_b32 s5, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s26, s74, 16 +; VI-NEXT: s_or_b32 s5, s5, s26 +; VI-NEXT: s_and_b32 s25, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s26, s73, 16 ; VI-NEXT: s_or_b32 s25, s25, s26 -; VI-NEXT: s_and_b32 s26, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s27, s47, 16 -; VI-NEXT: s_or_b32 s26, s26, s27 -; VI-NEXT: s_and_b32 s27, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s28, s46, 16 -; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s24, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s26, s72, 16 +; VI-NEXT: s_or_b32 s24, s24, s26 +; VI-NEXT: s_and_b32 s23, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s26, s63, 16 +; VI-NEXT: s_or_b32 s23, s23, s26 +; VI-NEXT: s_and_b32 s22, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s26, s62, 16 +; VI-NEXT: s_or_b32 s22, s22, s26 +; VI-NEXT: s_and_b32 s21, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s26, s61, 16 +; VI-NEXT: s_or_b32 s21, s21, s26 +; VI-NEXT: s_and_b32 s20, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s26, s60, 16 +; VI-NEXT: s_or_b32 s20, s20, s26 +; VI-NEXT: s_and_b32 s19, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s26, s59, 16 +; VI-NEXT: s_or_b32 s19, s19, s26 +; VI-NEXT: s_and_b32 s18, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s26, s58, 16 +; VI-NEXT: s_or_b32 s18, s18, s26 +; VI-NEXT: s_and_b32 s17, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s26, s57, 16 +; VI-NEXT: s_or_b32 s17, s17, s26 +; VI-NEXT: s_and_b32 s16, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s26, s56, 16 +; VI-NEXT: s_or_b32 s16, s16, s26 +; VI-NEXT: s_and_b32 s15, 0xffff, s15 +; VI-NEXT: s_lshl_b32 s26, s47, 16 +; VI-NEXT: s_or_b32 s15, s15, s26 +; VI-NEXT: s_and_b32 s14, 0xffff, s14 +; VI-NEXT: s_lshl_b32 s26, s46, 16 +; VI-NEXT: s_or_b32 s14, s14, s26 ; VI-NEXT: s_and_b32 s13, 0xffff, s13 -; VI-NEXT: s_lshl_b32 s28, s45, 16 -; VI-NEXT: s_or_b32 s13, s13, s28 +; VI-NEXT: s_lshl_b32 s26, s45, 16 +; VI-NEXT: s_or_b32 s13, s13, s26 ; VI-NEXT: s_and_b32 s12, 0xffff, s12 -; VI-NEXT: s_lshl_b32 s28, s44, 16 -; VI-NEXT: s_or_b32 s12, s12, s28 +; VI-NEXT: s_lshl_b32 s26, s44, 16 +; VI-NEXT: s_or_b32 s12, s12, s26 ; VI-NEXT: s_and_b32 s11, 0xffff, s11 -; VI-NEXT: s_lshl_b32 s28, s43, 16 -; VI-NEXT: s_or_b32 s11, s11, s28 +; VI-NEXT: s_lshl_b32 s26, s43, 16 +; VI-NEXT: s_or_b32 s11, s11, s26 ; VI-NEXT: s_and_b32 s10, 0xffff, s10 -; VI-NEXT: s_lshl_b32 s28, s42, 16 -; VI-NEXT: s_or_b32 s10, s10, s28 +; VI-NEXT: s_lshl_b32 s26, s42, 16 +; VI-NEXT: s_or_b32 s10, s10, s26 ; VI-NEXT: s_and_b32 s9, 0xffff, s9 -; VI-NEXT: s_lshl_b32 s28, s41, 16 -; VI-NEXT: s_or_b32 s9, s9, s28 +; VI-NEXT: s_lshl_b32 s26, s41, 16 +; VI-NEXT: s_or_b32 s9, s9, s26 ; VI-NEXT: s_and_b32 s8, 0xffff, s8 -; VI-NEXT: s_lshl_b32 s28, s40, 16 +; VI-NEXT: s_lshl_b32 s26, s29, 16 +; VI-NEXT: s_or_b32 s8, s8, s26 ; VI-NEXT: s_and_b32 s6, 0xffff, s6 -; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_lshl_b32 s26, s28, 16 +; VI-NEXT: s_or_b32 s6, s6, s26 ; VI-NEXT: s_and_b32 s7, 0xffff, s7 -; VI-NEXT: s_lshl_b32 s14, s14, 16 -; VI-NEXT: s_or_b32 s8, s8, s28 -; VI-NEXT: s_or_b32 s6, s6, s15 -; VI-NEXT: s_or_b32 s7, s7, s14 +; VI-NEXT: s_lshl_b32 s26, s27, 16 +; VI-NEXT: s_or_b32 s7, s7, s26 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: v_mov_b32_e32 v3, s17 -; VI-NEXT: v_mov_b32_e32 v4, s18 -; VI-NEXT: v_mov_b32_e32 v5, s19 -; VI-NEXT: v_mov_b32_e32 v6, s20 -; VI-NEXT: v_mov_b32_e32 v7, s21 -; VI-NEXT: v_mov_b32_e32 v8, s22 -; VI-NEXT: v_mov_b32_e32 v9, s23 -; VI-NEXT: v_mov_b32_e32 v10, s24 -; VI-NEXT: v_mov_b32_e32 v11, s25 -; VI-NEXT: v_mov_b32_e32 v12, s26 -; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v2, s25 +; VI-NEXT: v_mov_b32_e32 v3, s24 +; VI-NEXT: v_mov_b32_e32 v4, s23 +; VI-NEXT: v_mov_b32_e32 v5, s22 +; VI-NEXT: v_mov_b32_e32 v6, s21 +; VI-NEXT: v_mov_b32_e32 v7, s20 +; VI-NEXT: v_mov_b32_e32 v8, s19 +; VI-NEXT: v_mov_b32_e32 v9, s18 +; VI-NEXT: v_mov_b32_e32 v10, s17 +; VI-NEXT: v_mov_b32_e32 v11, s16 +; VI-NEXT: v_mov_b32_e32 v12, s15 +; VI-NEXT: v_mov_b32_e32 v13, s14 ; VI-NEXT: v_mov_b32_e32 v14, s13 ; VI-NEXT: v_mov_b32_e32 v15, s12 ; VI-NEXT: v_mov_b32_e32 v16, s11 @@ -24403,60 +24841,78 @@ define inreg <44 x half> @bitcast_v11i64_to_v44f16_scalar(<11 x i64> inreg %a, i ; VI-NEXT: ; implicit-def: $sgpr43 ; VI-NEXT: ; implicit-def: $sgpr42 ; VI-NEXT: ; implicit-def: $sgpr41 -; VI-NEXT: ; implicit-def: $sgpr40 -; VI-NEXT: ; implicit-def: $sgpr15 -; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 ; VI-NEXT: s_branch .LBB45_2 ; ; GFX9-LABEL: bitcast_v11i64_to_v44f16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, s16 +; GFX9-NEXT: v_mov_b32_e32 v10, s17 +; GFX9-NEXT: v_mov_b32_e32 v11, s18 +; GFX9-NEXT: v_mov_b32_e32 v12, s19 +; GFX9-NEXT: v_mov_b32_e32 v13, s20 +; GFX9-NEXT: v_mov_b32_e32 v14, s21 +; GFX9-NEXT: v_mov_b32_e32 v15, s22 +; GFX9-NEXT: v_mov_b32_e32 v16, s23 +; GFX9-NEXT: v_mov_b32_e32 v17, s24 +; GFX9-NEXT: v_mov_b32_e32 v18, s25 +; GFX9-NEXT: v_mov_b32_e32 v19, s26 +; GFX9-NEXT: v_readfirstlane_b32 s6, v9 +; GFX9-NEXT: v_mov_b32_e32 v9, s27 +; GFX9-NEXT: v_readfirstlane_b32 s7, v10 +; GFX9-NEXT: v_mov_b32_e32 v10, s28 +; GFX9-NEXT: v_readfirstlane_b32 s8, v11 +; GFX9-NEXT: v_mov_b32_e32 v11, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: v_readfirstlane_b32 s7, v1 -; GFX9-NEXT: v_readfirstlane_b32 s8, v2 -; GFX9-NEXT: v_readfirstlane_b32 s9, v3 -; GFX9-NEXT: v_readfirstlane_b32 s10, v4 -; GFX9-NEXT: v_readfirstlane_b32 s11, v5 -; GFX9-NEXT: v_readfirstlane_b32 s12, v6 +; GFX9-NEXT: v_readfirstlane_b32 s9, v12 +; GFX9-NEXT: v_readfirstlane_b32 s10, v13 +; GFX9-NEXT: v_readfirstlane_b32 s11, v14 +; GFX9-NEXT: v_readfirstlane_b32 s12, v15 +; GFX9-NEXT: v_readfirstlane_b32 s13, v16 +; GFX9-NEXT: v_readfirstlane_b32 s14, v17 +; GFX9-NEXT: v_readfirstlane_b32 s15, v18 +; GFX9-NEXT: v_readfirstlane_b32 s16, v19 +; GFX9-NEXT: v_readfirstlane_b32 s17, v9 +; GFX9-NEXT: v_readfirstlane_b32 s18, v10 +; GFX9-NEXT: v_readfirstlane_b32 s19, v11 +; GFX9-NEXT: v_readfirstlane_b32 s20, v0 +; GFX9-NEXT: v_readfirstlane_b32 s21, v1 +; GFX9-NEXT: v_readfirstlane_b32 s22, v2 +; GFX9-NEXT: v_readfirstlane_b32 s23, v3 +; GFX9-NEXT: v_readfirstlane_b32 s24, v4 +; GFX9-NEXT: v_readfirstlane_b32 s25, v5 +; GFX9-NEXT: v_readfirstlane_b32 s26, v6 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_readfirstlane_b32 s13, v7 +; GFX9-NEXT: v_readfirstlane_b32 s27, v7 ; GFX9-NEXT: s_cbranch_scc0 .LBB45_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_lshr_b32 s14, s13, 16 -; GFX9-NEXT: s_lshr_b32 s15, s12, 16 -; GFX9-NEXT: s_lshr_b32 s40, s11, 16 -; GFX9-NEXT: s_lshr_b32 s41, s10, 16 -; GFX9-NEXT: s_lshr_b32 s42, s9, 16 -; GFX9-NEXT: s_lshr_b32 s43, s8, 16 -; GFX9-NEXT: s_lshr_b32 s44, s7, 16 -; GFX9-NEXT: s_lshr_b32 s45, s6, 16 -; GFX9-NEXT: s_lshr_b32 s46, s29, 16 -; GFX9-NEXT: s_lshr_b32 s47, s28, 16 -; GFX9-NEXT: s_lshr_b32 s56, s27, 16 -; GFX9-NEXT: s_lshr_b32 s57, s26, 16 -; GFX9-NEXT: s_lshr_b32 s58, s25, 16 -; GFX9-NEXT: s_lshr_b32 s59, s24, 16 -; GFX9-NEXT: s_lshr_b32 s60, s23, 16 -; GFX9-NEXT: s_lshr_b32 s61, s22, 16 -; GFX9-NEXT: s_lshr_b32 s62, s21, 16 -; GFX9-NEXT: s_lshr_b32 s63, s20, 16 -; GFX9-NEXT: s_lshr_b32 s72, s19, 16 -; GFX9-NEXT: s_lshr_b32 s73, s18, 16 -; GFX9-NEXT: s_lshr_b32 s74, s17, 16 -; GFX9-NEXT: s_lshr_b32 s75, s16, 16 +; GFX9-NEXT: s_lshr_b32 s28, s27, 16 +; GFX9-NEXT: s_lshr_b32 s29, s26, 16 +; GFX9-NEXT: s_lshr_b32 s40, s25, 16 +; GFX9-NEXT: s_lshr_b32 s41, s24, 16 +; GFX9-NEXT: s_lshr_b32 s42, s23, 16 +; GFX9-NEXT: s_lshr_b32 s43, s22, 16 +; GFX9-NEXT: s_lshr_b32 s44, s21, 16 +; GFX9-NEXT: s_lshr_b32 s45, s20, 16 +; GFX9-NEXT: s_lshr_b32 s46, s19, 16 +; GFX9-NEXT: s_lshr_b32 s47, s18, 16 +; GFX9-NEXT: s_lshr_b32 s56, s17, 16 +; GFX9-NEXT: s_lshr_b32 s57, s16, 16 +; GFX9-NEXT: s_lshr_b32 s58, s15, 16 +; GFX9-NEXT: s_lshr_b32 s59, s14, 16 +; GFX9-NEXT: s_lshr_b32 s60, s13, 16 +; GFX9-NEXT: s_lshr_b32 s61, s12, 16 +; GFX9-NEXT: s_lshr_b32 s62, s11, 16 +; GFX9-NEXT: s_lshr_b32 s63, s10, 16 +; GFX9-NEXT: s_lshr_b32 s72, s9, 16 +; GFX9-NEXT: s_lshr_b32 s73, s8, 16 +; GFX9-NEXT: s_lshr_b32 s74, s7, 16 +; GFX9-NEXT: s_lshr_b32 s75, s6, 16 ; GFX9-NEXT: s_cbranch_execnz .LBB45_3 ; GFX9-NEXT: .LBB45_2: ; %cmp.true -; GFX9-NEXT: s_add_u32 s12, s12, 3 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-NEXT: s_add_u32 s10, s10, 3 -; GFX9-NEXT: s_addc_u32 s11, s11, 0 -; GFX9-NEXT: s_add_u32 s8, s8, 3 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-NEXT: s_add_u32 s6, s6, 3 -; GFX9-NEXT: s_addc_u32 s7, s7, 0 -; GFX9-NEXT: s_add_u32 s28, s28, 3 -; GFX9-NEXT: s_addc_u32 s29, s29, 0 ; GFX9-NEXT: s_add_u32 s26, s26, 3 ; GFX9-NEXT: s_addc_u32 s27, s27, 0 ; GFX9-NEXT: s_add_u32 s24, s24, 3 @@ -24469,73 +24925,83 @@ define inreg <44 x half> @bitcast_v11i64_to_v44f16_scalar(<11 x i64> inreg %a, i ; GFX9-NEXT: s_addc_u32 s19, s19, 0 ; GFX9-NEXT: s_add_u32 s16, s16, 3 ; GFX9-NEXT: s_addc_u32 s17, s17, 0 -; GFX9-NEXT: s_lshr_b32 s14, s13, 16 -; GFX9-NEXT: s_lshr_b32 s15, s12, 16 -; GFX9-NEXT: s_lshr_b32 s40, s11, 16 -; GFX9-NEXT: s_lshr_b32 s41, s10, 16 -; GFX9-NEXT: s_lshr_b32 s42, s9, 16 -; GFX9-NEXT: s_lshr_b32 s43, s8, 16 -; GFX9-NEXT: s_lshr_b32 s44, s7, 16 -; GFX9-NEXT: s_lshr_b32 s45, s6, 16 -; GFX9-NEXT: s_lshr_b32 s46, s29, 16 -; GFX9-NEXT: s_lshr_b32 s47, s28, 16 -; GFX9-NEXT: s_lshr_b32 s56, s27, 16 -; GFX9-NEXT: s_lshr_b32 s57, s26, 16 -; GFX9-NEXT: s_lshr_b32 s58, s25, 16 -; GFX9-NEXT: s_lshr_b32 s59, s24, 16 -; GFX9-NEXT: s_lshr_b32 s60, s23, 16 -; GFX9-NEXT: s_lshr_b32 s61, s22, 16 -; GFX9-NEXT: s_lshr_b32 s62, s21, 16 -; GFX9-NEXT: s_lshr_b32 s63, s20, 16 -; GFX9-NEXT: s_lshr_b32 s72, s19, 16 -; GFX9-NEXT: s_lshr_b32 s73, s18, 16 -; GFX9-NEXT: s_lshr_b32 s74, s17, 16 -; GFX9-NEXT: s_lshr_b32 s75, s16, 16 +; GFX9-NEXT: s_add_u32 s14, s14, 3 +; GFX9-NEXT: s_addc_u32 s15, s15, 0 +; GFX9-NEXT: s_add_u32 s12, s12, 3 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_add_u32 s10, s10, 3 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_lshr_b32 s28, s27, 16 +; GFX9-NEXT: s_lshr_b32 s29, s26, 16 +; GFX9-NEXT: s_lshr_b32 s40, s25, 16 +; GFX9-NEXT: s_lshr_b32 s41, s24, 16 +; GFX9-NEXT: s_lshr_b32 s42, s23, 16 +; GFX9-NEXT: s_lshr_b32 s43, s22, 16 +; GFX9-NEXT: s_lshr_b32 s44, s21, 16 +; GFX9-NEXT: s_lshr_b32 s45, s20, 16 +; GFX9-NEXT: s_lshr_b32 s46, s19, 16 +; GFX9-NEXT: s_lshr_b32 s47, s18, 16 +; GFX9-NEXT: s_lshr_b32 s56, s17, 16 +; GFX9-NEXT: s_lshr_b32 s57, s16, 16 +; GFX9-NEXT: s_lshr_b32 s58, s15, 16 +; GFX9-NEXT: s_lshr_b32 s59, s14, 16 +; GFX9-NEXT: s_lshr_b32 s60, s13, 16 +; GFX9-NEXT: s_lshr_b32 s61, s12, 16 +; GFX9-NEXT: s_lshr_b32 s62, s11, 16 +; GFX9-NEXT: s_lshr_b32 s63, s10, 16 +; GFX9-NEXT: s_lshr_b32 s72, s9, 16 +; GFX9-NEXT: s_lshr_b32 s73, s8, 16 +; GFX9-NEXT: s_lshr_b32 s74, s7, 16 +; GFX9-NEXT: s_lshr_b32 s75, s6, 16 ; GFX9-NEXT: .LBB45_3: ; %end -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s75 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s74 -; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s73 -; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s72 -; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s63 -; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s62 -; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s61 -; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s60 -; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s59 -; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s58 -; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s57 -; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s56 -; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s47 -; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s46 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s45 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s44 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s43 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s42 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s41 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s40 -; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s15 -; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s7, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s9, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s10, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s11, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s12, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s13, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s14, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s15, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s16, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s17, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s40 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s29 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s28 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s18 -; GFX9-NEXT: v_mov_b32_e32 v5, s19 -; GFX9-NEXT: v_mov_b32_e32 v6, s20 -; GFX9-NEXT: v_mov_b32_e32 v7, s21 -; GFX9-NEXT: v_mov_b32_e32 v8, s22 -; GFX9-NEXT: v_mov_b32_e32 v9, s23 -; GFX9-NEXT: v_mov_b32_e32 v10, s24 -; GFX9-NEXT: v_mov_b32_e32 v11, s25 -; GFX9-NEXT: v_mov_b32_e32 v12, s26 -; GFX9-NEXT: v_mov_b32_e32 v13, s27 -; GFX9-NEXT: v_mov_b32_e32 v14, s6 -; GFX9-NEXT: v_mov_b32_e32 v15, s7 -; GFX9-NEXT: v_mov_b32_e32 v16, s8 -; GFX9-NEXT: v_mov_b32_e32 v17, s9 -; GFX9-NEXT: v_mov_b32_e32 v18, s10 -; GFX9-NEXT: v_mov_b32_e32 v19, s11 -; GFX9-NEXT: v_mov_b32_e32 v20, s12 -; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-NEXT: v_mov_b32_e32 v12, s16 +; GFX9-NEXT: v_mov_b32_e32 v13, s17 +; GFX9-NEXT: v_mov_b32_e32 v14, s18 +; GFX9-NEXT: v_mov_b32_e32 v15, s19 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v18, s22 +; GFX9-NEXT: v_mov_b32_e32 v19, s23 +; GFX9-NEXT: v_mov_b32_e32 v20, s24 +; GFX9-NEXT: v_mov_b32_e32 v21, s25 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB45_4: ; GFX9-NEXT: ; implicit-def: $sgpr75 @@ -24558,40 +25024,67 @@ define inreg <44 x half> @bitcast_v11i64_to_v44f16_scalar(<11 x i64> inreg %a, i ; GFX9-NEXT: ; implicit-def: $sgpr42 ; GFX9-NEXT: ; implicit-def: $sgpr41 ; GFX9-NEXT: ; implicit-def: $sgpr40 -; GFX9-NEXT: ; implicit-def: $sgpr15 -; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr29 +; GFX9-NEXT: ; implicit-def: $sgpr28 ; GFX9-NEXT: s_branch .LBB45_2 ; ; GFX11-LABEL: bitcast_v11i64_to_v44f16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-NEXT: v_dual_mov_b32 v9, s16 :: v_dual_mov_b32 v10, s17 +; GFX11-NEXT: v_dual_mov_b32 v11, s18 :: v_dual_mov_b32 v12, s19 +; GFX11-NEXT: v_dual_mov_b32 v13, s20 :: v_dual_mov_b32 v14, s21 +; GFX11-NEXT: v_dual_mov_b32 v15, s22 :: v_dual_mov_b32 v16, s23 +; GFX11-NEXT: v_dual_mov_b32 v17, s24 :: v_dual_mov_b32 v18, s25 +; GFX11-NEXT: v_dual_mov_b32 v19, s26 :: v_dual_mov_b32 v20, s27 +; GFX11-NEXT: v_dual_mov_b32 v21, s28 :: v_dual_mov_b32 v22, s29 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s7, v2 -; GFX11-NEXT: v_readfirstlane_b32 s6, v3 +; GFX11-NEXT: v_readfirstlane_b32 s0, v5 +; GFX11-NEXT: v_readfirstlane_b32 s1, v6 +; GFX11-NEXT: v_readfirstlane_b32 s2, v7 +; GFX11-NEXT: v_readfirstlane_b32 s3, v8 +; GFX11-NEXT: v_readfirstlane_b32 s4, v9 +; GFX11-NEXT: v_readfirstlane_b32 s5, v10 +; GFX11-NEXT: v_readfirstlane_b32 s6, v11 +; GFX11-NEXT: v_readfirstlane_b32 s7, v12 +; GFX11-NEXT: v_readfirstlane_b32 s8, v13 +; GFX11-NEXT: v_readfirstlane_b32 s9, v14 +; GFX11-NEXT: v_readfirstlane_b32 s10, v15 +; GFX11-NEXT: v_readfirstlane_b32 s11, v16 +; GFX11-NEXT: v_readfirstlane_b32 s12, v17 +; GFX11-NEXT: v_readfirstlane_b32 s13, v18 +; GFX11-NEXT: v_readfirstlane_b32 s14, v19 +; GFX11-NEXT: v_readfirstlane_b32 s15, v20 +; GFX11-NEXT: v_readfirstlane_b32 s16, v21 +; GFX11-NEXT: v_readfirstlane_b32 s17, v22 +; GFX11-NEXT: v_readfirstlane_b32 s18, v0 +; GFX11-NEXT: v_readfirstlane_b32 s19, v1 +; GFX11-NEXT: v_readfirstlane_b32 s21, v2 +; GFX11-NEXT: v_readfirstlane_b32 s20, v3 ; GFX11-NEXT: s_mov_b32 s62, 0 -; GFX11-NEXT: s_and_b32 s8, vcc_lo, exec_lo +; GFX11-NEXT: s_and_b32 s22, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB45_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: s_lshr_b32 s8, s6, 16 -; GFX11-NEXT: s_lshr_b32 s9, s7, 16 -; GFX11-NEXT: s_lshr_b32 s10, s5, 16 -; GFX11-NEXT: s_lshr_b32 s11, s4, 16 -; GFX11-NEXT: s_lshr_b32 s12, s29, 16 -; GFX11-NEXT: s_lshr_b32 s13, s28, 16 -; GFX11-NEXT: s_lshr_b32 s14, s27, 16 -; GFX11-NEXT: s_lshr_b32 s15, s26, 16 -; GFX11-NEXT: s_lshr_b32 s40, s25, 16 -; GFX11-NEXT: s_lshr_b32 s41, s24, 16 -; GFX11-NEXT: s_lshr_b32 s42, s23, 16 -; GFX11-NEXT: s_lshr_b32 s43, s22, 16 -; GFX11-NEXT: s_lshr_b32 s44, s21, 16 -; GFX11-NEXT: s_lshr_b32 s45, s20, 16 -; GFX11-NEXT: s_lshr_b32 s46, s19, 16 -; GFX11-NEXT: s_lshr_b32 s47, s18, 16 -; GFX11-NEXT: s_lshr_b32 s56, s17, 16 -; GFX11-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-NEXT: s_lshr_b32 s22, s20, 16 +; GFX11-NEXT: s_lshr_b32 s23, s21, 16 +; GFX11-NEXT: s_lshr_b32 s24, s19, 16 +; GFX11-NEXT: s_lshr_b32 s25, s18, 16 +; GFX11-NEXT: s_lshr_b32 s26, s17, 16 +; GFX11-NEXT: s_lshr_b32 s27, s16, 16 +; GFX11-NEXT: s_lshr_b32 s28, s15, 16 +; GFX11-NEXT: s_lshr_b32 s29, s14, 16 +; GFX11-NEXT: s_lshr_b32 s40, s13, 16 +; GFX11-NEXT: s_lshr_b32 s41, s12, 16 +; GFX11-NEXT: s_lshr_b32 s42, s11, 16 +; GFX11-NEXT: s_lshr_b32 s43, s10, 16 +; GFX11-NEXT: s_lshr_b32 s44, s9, 16 +; GFX11-NEXT: s_lshr_b32 s45, s8, 16 +; GFX11-NEXT: s_lshr_b32 s46, s7, 16 +; GFX11-NEXT: s_lshr_b32 s47, s6, 16 +; GFX11-NEXT: s_lshr_b32 s56, s5, 16 +; GFX11-NEXT: s_lshr_b32 s57, s4, 16 ; GFX11-NEXT: s_lshr_b32 s58, s3, 16 ; GFX11-NEXT: s_lshr_b32 s59, s2, 16 ; GFX11-NEXT: s_lshr_b32 s60, s1, 16 @@ -24599,46 +25092,46 @@ define inreg <44 x half> @bitcast_v11i64_to_v44f16_scalar(<11 x i64> inreg %a, i ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s62 ; GFX11-NEXT: s_cbranch_vccnz .LBB45_3 ; GFX11-NEXT: .LBB45_2: ; %cmp.true -; GFX11-NEXT: s_add_u32 s7, s7, 3 -; GFX11-NEXT: s_addc_u32 s6, s6, 0 -; GFX11-NEXT: s_add_u32 s4, s4, 3 -; GFX11-NEXT: s_addc_u32 s5, s5, 0 -; GFX11-NEXT: s_add_u32 s28, s28, 3 -; GFX11-NEXT: s_addc_u32 s29, s29, 0 -; GFX11-NEXT: s_add_u32 s26, s26, 3 -; GFX11-NEXT: s_addc_u32 s27, s27, 0 -; GFX11-NEXT: s_add_u32 s24, s24, 3 -; GFX11-NEXT: s_addc_u32 s25, s25, 0 -; GFX11-NEXT: s_add_u32 s22, s22, 3 -; GFX11-NEXT: s_addc_u32 s23, s23, 0 -; GFX11-NEXT: s_add_u32 s20, s20, 3 -; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s21, s21, 3 +; GFX11-NEXT: s_addc_u32 s20, s20, 0 ; GFX11-NEXT: s_add_u32 s18, s18, 3 ; GFX11-NEXT: s_addc_u32 s19, s19, 0 ; GFX11-NEXT: s_add_u32 s16, s16, 3 ; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s14, s14, 3 +; GFX11-NEXT: s_addc_u32 s15, s15, 0 +; GFX11-NEXT: s_add_u32 s12, s12, 3 +; GFX11-NEXT: s_addc_u32 s13, s13, 0 +; GFX11-NEXT: s_add_u32 s10, s10, 3 +; GFX11-NEXT: s_addc_u32 s11, s11, 0 +; GFX11-NEXT: s_add_u32 s8, s8, 3 +; GFX11-NEXT: s_addc_u32 s9, s9, 0 +; GFX11-NEXT: s_add_u32 s6, s6, 3 +; GFX11-NEXT: s_addc_u32 s7, s7, 0 +; GFX11-NEXT: s_add_u32 s4, s4, 3 +; GFX11-NEXT: s_addc_u32 s5, s5, 0 ; GFX11-NEXT: s_add_u32 s2, s2, 3 ; GFX11-NEXT: s_addc_u32 s3, s3, 0 ; GFX11-NEXT: s_add_u32 s0, s0, 3 ; GFX11-NEXT: s_addc_u32 s1, s1, 0 -; GFX11-NEXT: s_lshr_b32 s8, s6, 16 -; GFX11-NEXT: s_lshr_b32 s9, s7, 16 -; GFX11-NEXT: s_lshr_b32 s10, s5, 16 -; GFX11-NEXT: s_lshr_b32 s11, s4, 16 -; GFX11-NEXT: s_lshr_b32 s12, s29, 16 -; GFX11-NEXT: s_lshr_b32 s13, s28, 16 -; GFX11-NEXT: s_lshr_b32 s14, s27, 16 -; GFX11-NEXT: s_lshr_b32 s15, s26, 16 -; GFX11-NEXT: s_lshr_b32 s40, s25, 16 -; GFX11-NEXT: s_lshr_b32 s41, s24, 16 -; GFX11-NEXT: s_lshr_b32 s42, s23, 16 -; GFX11-NEXT: s_lshr_b32 s43, s22, 16 -; GFX11-NEXT: s_lshr_b32 s44, s21, 16 -; GFX11-NEXT: s_lshr_b32 s45, s20, 16 -; GFX11-NEXT: s_lshr_b32 s46, s19, 16 -; GFX11-NEXT: s_lshr_b32 s47, s18, 16 -; GFX11-NEXT: s_lshr_b32 s56, s17, 16 -; GFX11-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-NEXT: s_lshr_b32 s22, s20, 16 +; GFX11-NEXT: s_lshr_b32 s23, s21, 16 +; GFX11-NEXT: s_lshr_b32 s24, s19, 16 +; GFX11-NEXT: s_lshr_b32 s25, s18, 16 +; GFX11-NEXT: s_lshr_b32 s26, s17, 16 +; GFX11-NEXT: s_lshr_b32 s27, s16, 16 +; GFX11-NEXT: s_lshr_b32 s28, s15, 16 +; GFX11-NEXT: s_lshr_b32 s29, s14, 16 +; GFX11-NEXT: s_lshr_b32 s40, s13, 16 +; GFX11-NEXT: s_lshr_b32 s41, s12, 16 +; GFX11-NEXT: s_lshr_b32 s42, s11, 16 +; GFX11-NEXT: s_lshr_b32 s43, s10, 16 +; GFX11-NEXT: s_lshr_b32 s44, s9, 16 +; GFX11-NEXT: s_lshr_b32 s45, s8, 16 +; GFX11-NEXT: s_lshr_b32 s46, s7, 16 +; GFX11-NEXT: s_lshr_b32 s47, s6, 16 +; GFX11-NEXT: s_lshr_b32 s56, s5, 16 +; GFX11-NEXT: s_lshr_b32 s57, s4, 16 ; GFX11-NEXT: s_lshr_b32 s58, s3, 16 ; GFX11-NEXT: s_lshr_b32 s59, s2, 16 ; GFX11-NEXT: s_lshr_b32 s60, s1, 16 @@ -24649,35 +25142,35 @@ define inreg <44 x half> @bitcast_v11i64_to_v44f16_scalar(<11 x i64> inreg %a, i ; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s60 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s59 ; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s58 -; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s57 -; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s56 -; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s47 -; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s46 -; GFX11-NEXT: s_pack_ll_b32_b16 s20, s20, s45 -; GFX11-NEXT: s_pack_ll_b32_b16 s21, s21, s44 -; GFX11-NEXT: s_pack_ll_b32_b16 s22, s22, s43 -; GFX11-NEXT: s_pack_ll_b32_b16 s23, s23, s42 -; GFX11-NEXT: s_pack_ll_b32_b16 s24, s24, s41 -; GFX11-NEXT: s_pack_ll_b32_b16 s25, s25, s40 -; GFX11-NEXT: s_pack_ll_b32_b16 s15, s26, s15 -; GFX11-NEXT: s_pack_ll_b32_b16 s14, s27, s14 -; GFX11-NEXT: s_pack_ll_b32_b16 s13, s28, s13 -; GFX11-NEXT: s_pack_ll_b32_b16 s12, s29, s12 -; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s11 -; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s10 -; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s9 -; GFX11-NEXT: s_pack_ll_b32_b16 s6, s6, s8 +; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s57 +; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s56 +; GFX11-NEXT: s_pack_ll_b32_b16 s6, s6, s47 +; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s46 +; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s45 +; GFX11-NEXT: s_pack_ll_b32_b16 s9, s9, s44 +; GFX11-NEXT: s_pack_ll_b32_b16 s10, s10, s43 +; GFX11-NEXT: s_pack_ll_b32_b16 s11, s11, s42 +; GFX11-NEXT: s_pack_ll_b32_b16 s12, s12, s41 +; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s40 +; GFX11-NEXT: s_pack_ll_b32_b16 s14, s14, s29 +; GFX11-NEXT: s_pack_ll_b32_b16 s15, s15, s28 +; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s27 +; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s26 +; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s25 +; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s24 +; GFX11-NEXT: s_pack_ll_b32_b16 s21, s21, s23 +; GFX11-NEXT: s_pack_ll_b32_b16 s20, s20, s22 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 -; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 -; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 -; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 -; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 -; GFX11-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s14 -; GFX11-NEXT: v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v17, s12 -; GFX11-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 -; GFX11-NEXT: v_dual_mov_b32 v20, s7 :: v_dual_mov_b32 v21, s6 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GFX11-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-NEXT: v_dual_mov_b32 v20, s21 :: v_dual_mov_b32 v21, s20 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB45_4: ; GFX11-NEXT: ; implicit-def: $sgpr61 @@ -24694,14 +25187,14 @@ define inreg <44 x half> @bitcast_v11i64_to_v44f16_scalar(<11 x i64> inreg %a, i ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr41 ; GFX11-NEXT: ; implicit-def: $sgpr40 -; GFX11-NEXT: ; implicit-def: $sgpr15 -; GFX11-NEXT: ; implicit-def: $sgpr14 -; GFX11-NEXT: ; implicit-def: $sgpr13 -; GFX11-NEXT: ; implicit-def: $sgpr12 -; GFX11-NEXT: ; implicit-def: $sgpr11 -; GFX11-NEXT: ; implicit-def: $sgpr10 -; GFX11-NEXT: ; implicit-def: $sgpr9 -; GFX11-NEXT: ; implicit-def: $sgpr8 +; GFX11-NEXT: ; implicit-def: $sgpr29 +; GFX11-NEXT: ; implicit-def: $sgpr28 +; GFX11-NEXT: ; implicit-def: $sgpr27 +; GFX11-NEXT: ; implicit-def: $sgpr26 +; GFX11-NEXT: ; implicit-def: $sgpr25 +; GFX11-NEXT: ; implicit-def: $sgpr24 +; GFX11-NEXT: ; implicit-def: $sgpr23 +; GFX11-NEXT: ; implicit-def: $sgpr22 ; GFX11-NEXT: s_branch .LBB45_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -27798,21 +28291,21 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: v_mov_b32_e32 v22, s16 -; VI-NEXT: v_mov_b32_e32 v23, s17 -; VI-NEXT: v_mov_b32_e32 v20, s18 -; VI-NEXT: v_mov_b32_e32 v21, s19 -; VI-NEXT: v_mov_b32_e32 v18, s20 -; VI-NEXT: v_mov_b32_e32 v19, s21 -; VI-NEXT: v_mov_b32_e32 v14, s22 -; VI-NEXT: v_mov_b32_e32 v15, s23 +; VI-NEXT: v_mov_b32_e32 v21, s16 +; VI-NEXT: v_mov_b32_e32 v22, s17 +; VI-NEXT: v_mov_b32_e32 v19, s18 +; VI-NEXT: v_mov_b32_e32 v20, s19 +; VI-NEXT: v_mov_b32_e32 v17, s20 +; VI-NEXT: v_mov_b32_e32 v18, s21 +; VI-NEXT: v_mov_b32_e32 v11, s22 +; VI-NEXT: v_mov_b32_e32 v12, s23 ; VI-NEXT: v_mov_b32_e32 v9, s24 ; VI-NEXT: v_mov_b32_e32 v10, s25 -; VI-NEXT: v_mov_b32_e32 v16, s26 -; VI-NEXT: v_mov_b32_e32 v17, s27 +; VI-NEXT: v_mov_b32_e32 v15, s26 +; VI-NEXT: v_mov_b32_e32 v16, s27 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v12, s28 -; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v13, s28 +; VI-NEXT: v_mov_b32_e32 v14, s29 ; VI-NEXT: s_cbranch_scc0 .LBB49_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 @@ -27823,33 +28316,33 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v13 ; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 ; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v21 ; VI-NEXT: s_cbranch_execnz .LBB49_3 ; VI-NEXT: .LBB49_2: ; %cmp.true ; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; VI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 @@ -27858,56 +28351,56 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v13 ; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 ; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v21 ; VI-NEXT: .LBB49_3: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_or_b32_sdwa v24, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v25 -; VI-NEXT: v_or_b32_sdwa v25, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v26 -; VI-NEXT: v_or_b32_sdwa v26, v20, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 -; VI-NEXT: v_or_b32_sdwa v27, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 -; VI-NEXT: v_or_b32_sdwa v28, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 -; VI-NEXT: v_or_b32_sdwa v29, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v51 -; VI-NEXT: v_or_b32_sdwa v22, v14, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 -; VI-NEXT: v_or_b32_sdwa v23, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 -; VI-NEXT: v_or_b32_sdwa v12, v12, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 +; VI-NEXT: v_or_b32_sdwa v24, v21, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v25 +; VI-NEXT: v_or_b32_sdwa v25, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 +; VI-NEXT: v_or_b32_sdwa v26, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v27 +; VI-NEXT: v_or_b32_sdwa v27, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v28 +; VI-NEXT: v_or_b32_sdwa v28, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v29 +; VI-NEXT: v_or_b32_sdwa v29, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v51 +; VI-NEXT: v_or_b32_sdwa v22, v11, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v23 +; VI-NEXT: v_or_b32_sdwa v23, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v39 ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v37 +; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v38 ; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v49 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v50 +; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v37 +; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v49 ; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 -; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v48 +; VI-NEXT: v_or_b32_sdwa v10, v15, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v48 ; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; VI-NEXT: v_or_b32_sdwa v10, v16, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; VI-NEXT: v_or_b32_sdwa v11, v16, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 -; VI-NEXT: v_or_b32_sdwa v11, v17, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v33 ; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -27934,11 +28427,11 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; VI-NEXT: ; implicit-def: $vgpr28 ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr23 ; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr50 ; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr38 ; VI-NEXT: ; implicit-def: $vgpr37 @@ -27955,21 +28448,21 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GFX9-NEXT: v_mov_b32_e32 v22, s16 -; GFX9-NEXT: v_mov_b32_e32 v23, s17 -; GFX9-NEXT: v_mov_b32_e32 v20, s18 -; GFX9-NEXT: v_mov_b32_e32 v21, s19 -; GFX9-NEXT: v_mov_b32_e32 v18, s20 -; GFX9-NEXT: v_mov_b32_e32 v19, s21 -; GFX9-NEXT: v_mov_b32_e32 v14, s22 -; GFX9-NEXT: v_mov_b32_e32 v15, s23 +; GFX9-NEXT: v_mov_b32_e32 v21, s16 +; GFX9-NEXT: v_mov_b32_e32 v22, s17 +; GFX9-NEXT: v_mov_b32_e32 v19, s18 +; GFX9-NEXT: v_mov_b32_e32 v20, s19 +; GFX9-NEXT: v_mov_b32_e32 v17, s20 +; GFX9-NEXT: v_mov_b32_e32 v18, s21 +; GFX9-NEXT: v_mov_b32_e32 v11, s22 +; GFX9-NEXT: v_mov_b32_e32 v12, s23 ; GFX9-NEXT: v_mov_b32_e32 v9, s24 ; GFX9-NEXT: v_mov_b32_e32 v10, s25 -; GFX9-NEXT: v_mov_b32_e32 v16, s26 -; GFX9-NEXT: v_mov_b32_e32 v17, s27 +; GFX9-NEXT: v_mov_b32_e32 v15, s26 +; GFX9-NEXT: v_mov_b32_e32 v16, s27 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_mov_b32_e32 v12, s28 -; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v13, s28 +; GFX9-NEXT: v_mov_b32_e32 v14, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB49_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 @@ -27980,33 +28473,33 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v21 ; GFX9-NEXT: s_cbranch_execnz .LBB49_3 ; GFX9-NEXT: .LBB49_2: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; GFX9-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v6 ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v5 @@ -28015,62 +28508,62 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v21 ; GFX9-NEXT: .LBB49_3: ; %end -; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GFX9-NEXT: v_lshl_or_b32 v24, v24, 16, v22 -; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v23 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v22 -; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v14 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v15 -; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v14 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v17 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v11, v11, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v24, v24, 16, v21 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v14 ; GFX9-NEXT: v_lshl_or_b32 v14, v37, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX9-NEXT: v_lshl_or_b32 v15, v36, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; GFX9-NEXT: v_lshl_or_b32 v8, v8, 16, v9 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v10 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v16 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v15, v36, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v23, v23, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v16 ; GFX9-NEXT: v_lshl_or_b32 v16, v35, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v18 -; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v17 ; GFX9-NEXT: v_lshl_or_b32 v17, v34, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v19 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v20 ; GFX9-NEXT: v_lshl_or_b32 v18, v33, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v20 -; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v19 ; GFX9-NEXT: v_lshl_or_b32 v19, v32, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v20 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v21 ; GFX9-NEXT: v_lshl_or_b32 v20, v31, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 -; GFX9-NEXT: v_lshl_or_b32 v9, v49, 16, v9 -; GFX9-NEXT: v_lshl_or_b32 v10, v48, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v9, v50, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v49, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v48, 16, v11 ; GFX9-NEXT: v_lshl_or_b32 v12, v39, 16, v12 ; GFX9-NEXT: v_lshl_or_b32 v13, v38, 16, v13 ; GFX9-NEXT: v_lshl_or_b32 v21, v30, 16, v0 @@ -28091,11 +28584,11 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr23 ; GFX9-NEXT: ; implicit-def: $vgpr8 +; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr37 @@ -28753,7 +29246,7 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 ; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v21 ; SI-NEXT: .LBB50_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload @@ -31178,408 +31671,444 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; SI-NEXT: v_readfirstlane_b32 s10, v1 -; SI-NEXT: v_readfirstlane_b32 s11, v2 -; SI-NEXT: v_readfirstlane_b32 s8, v3 -; SI-NEXT: v_readfirstlane_b32 s9, v4 -; SI-NEXT: v_readfirstlane_b32 s6, v5 -; SI-NEXT: v_readfirstlane_b32 s7, v6 -; SI-NEXT: v_readfirstlane_b32 s4, v7 -; SI-NEXT: s_and_b64 s[12:13], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s5, v8 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v21, s16 +; SI-NEXT: v_mov_b32_e32 v22, s17 +; SI-NEXT: v_mov_b32_e32 v19, s18 +; SI-NEXT: v_mov_b32_e32 v20, s19 +; SI-NEXT: v_mov_b32_e32 v17, s20 +; SI-NEXT: v_mov_b32_e32 v18, s21 +; SI-NEXT: v_mov_b32_e32 v15, s22 +; SI-NEXT: v_mov_b32_e32 v16, s23 +; SI-NEXT: v_mov_b32_e32 v11, s24 +; SI-NEXT: v_mov_b32_e32 v12, s25 +; SI-NEXT: v_mov_b32_e32 v13, s26 +; SI-NEXT: v_mov_b32_e32 v14, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v9, s28 +; SI-NEXT: v_mov_b32_e32 v10, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s12, s5, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s12 -; SI-NEXT: s_lshr_b32 s12, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s12 -; SI-NEXT: s_lshr_b32 s12, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s12 -; SI-NEXT: s_lshr_b32 s12, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s12 -; SI-NEXT: s_lshr_b32 s12, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s12 -; SI-NEXT: s_lshr_b32 s12, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s12 -; SI-NEXT: s_lshr_b32 s12, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s12 -; SI-NEXT: s_lshr_b32 s12, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s12 -; SI-NEXT: s_lshr_b32 s12, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 -; SI-NEXT: s_lshr_b32 s12, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s12 -; SI-NEXT: s_lshr_b32 s12, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s12 -; SI-NEXT: s_lshr_b32 s12, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s12 -; SI-NEXT: s_lshr_b32 s12, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s12 -; SI-NEXT: s_lshr_b32 s12, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s12 -; SI-NEXT: s_lshr_b32 s12, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s12 -; SI-NEXT: s_lshr_b32 s12, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s12 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s12 -; SI-NEXT: s_lshr_b32 s12, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s12 -; SI-NEXT: s_lshr_b32 s12, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s12 -; SI-NEXT: s_lshr_b32 s12, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s12 -; SI-NEXT: s_lshr_b32 s12, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s12 -; SI-NEXT: s_lshr_b32 s12, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s16 -; SI-NEXT: s_cbranch_execnz .LBB53_3 -; SI-NEXT: .LBB53_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[1:2], s[16:17], 1.0 -; SI-NEXT: v_add_f64 v[36:37], s[18:19], 1.0 -; SI-NEXT: v_add_f64 v[33:34], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[29:30], s[22:23], 1.0 -; SI-NEXT: v_add_f64 v[25:26], s[24:25], 1.0 -; SI-NEXT: v_add_f64 v[21:22], s[26:27], 1.0 -; SI-NEXT: v_add_f64 v[18:19], s[28:29], 1.0 -; SI-NEXT: v_add_f64 v[14:15], s[10:11], 1.0 -; SI-NEXT: v_add_f64 v[10:11], s[8:9], 1.0 -; SI-NEXT: v_add_f64 v[6:7], s[6:7], 1.0 -; SI-NEXT: v_add_f64 v[4:5], s[4:5], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v15 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v19 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v6 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v4 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v63, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v3 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v10 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v13 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v40, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v21 +; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: .LBB53_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_or_b32_e32 v51, v51, v52 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v51, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v49, v49, v50 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v24 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v49, vcc, 8, v0 -; SI-NEXT: v_or_b32_e32 v39, v39, v48 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: buffer_store_dword v39, v49, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v39, vcc, 12, v0 -; SI-NEXT: v_or_b32_e32 v37, v37, v38 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v37, vcc, 16, v0 -; SI-NEXT: v_or_b32_e32 v35, v35, v36 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v35, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v33, v34, v33 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v33, vcc, 24, v0 -; SI-NEXT: v_or_b32_e32 v31, v32, v31 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v31, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v29, v30, v29 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v29, vcc, 32, v0 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v27, vcc, 36, v0 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v25, vcc, 40, v0 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v23, vcc, 44, v0 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v21, vcc, 48, v0 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 -; SI-NEXT: v_add_i32_e32 v15, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: s_branch .LBB53_2 ; ; VI-LABEL: bitcast_v11f64_to_v44f16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: v_mov_b32_e32 v22, s16 -; VI-NEXT: v_mov_b32_e32 v23, s17 -; VI-NEXT: v_mov_b32_e32 v20, s18 -; VI-NEXT: v_mov_b32_e32 v21, s19 -; VI-NEXT: v_mov_b32_e32 v18, s20 -; VI-NEXT: v_mov_b32_e32 v19, s21 -; VI-NEXT: v_mov_b32_e32 v14, s22 -; VI-NEXT: v_mov_b32_e32 v15, s23 +; VI-NEXT: v_mov_b32_e32 v21, s16 +; VI-NEXT: v_mov_b32_e32 v22, s17 +; VI-NEXT: v_mov_b32_e32 v19, s18 +; VI-NEXT: v_mov_b32_e32 v20, s19 +; VI-NEXT: v_mov_b32_e32 v17, s20 +; VI-NEXT: v_mov_b32_e32 v18, s21 +; VI-NEXT: v_mov_b32_e32 v11, s22 +; VI-NEXT: v_mov_b32_e32 v12, s23 ; VI-NEXT: v_mov_b32_e32 v9, s24 ; VI-NEXT: v_mov_b32_e32 v10, s25 -; VI-NEXT: v_mov_b32_e32 v16, s26 -; VI-NEXT: v_mov_b32_e32 v17, s27 +; VI-NEXT: v_mov_b32_e32 v15, s26 +; VI-NEXT: v_mov_b32_e32 v16, s27 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v12, s28 -; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v13, s28 +; VI-NEXT: v_mov_b32_e32 v14, s29 ; VI-NEXT: s_cbranch_scc0 .LBB53_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 @@ -31590,33 +32119,33 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v13 ; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 ; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v21 ; VI-NEXT: s_cbranch_execnz .LBB53_3 ; VI-NEXT: .LBB53_2: ; %cmp.true ; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; VI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 @@ -31625,56 +32154,56 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v13 ; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 ; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v21 ; VI-NEXT: .LBB53_3: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_or_b32_sdwa v24, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v25 -; VI-NEXT: v_or_b32_sdwa v25, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v26 -; VI-NEXT: v_or_b32_sdwa v26, v20, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 -; VI-NEXT: v_or_b32_sdwa v27, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 -; VI-NEXT: v_or_b32_sdwa v28, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 -; VI-NEXT: v_or_b32_sdwa v29, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v51 -; VI-NEXT: v_or_b32_sdwa v22, v14, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 -; VI-NEXT: v_or_b32_sdwa v23, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 -; VI-NEXT: v_or_b32_sdwa v12, v12, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 +; VI-NEXT: v_or_b32_sdwa v24, v21, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v25 +; VI-NEXT: v_or_b32_sdwa v25, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 +; VI-NEXT: v_or_b32_sdwa v26, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v27 +; VI-NEXT: v_or_b32_sdwa v27, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v28 +; VI-NEXT: v_or_b32_sdwa v28, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v29 +; VI-NEXT: v_or_b32_sdwa v29, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v51 +; VI-NEXT: v_or_b32_sdwa v22, v11, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v23 +; VI-NEXT: v_or_b32_sdwa v23, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v39 ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v37 +; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v38 ; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v49 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v50 +; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v37 +; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v49 ; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 -; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v48 +; VI-NEXT: v_or_b32_sdwa v10, v15, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v48 ; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; VI-NEXT: v_or_b32_sdwa v10, v16, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; VI-NEXT: v_or_b32_sdwa v11, v16, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 -; VI-NEXT: v_or_b32_sdwa v11, v17, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v33 ; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -31701,11 +32230,11 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; VI-NEXT: ; implicit-def: $vgpr28 ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr23 ; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr50 ; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr38 ; VI-NEXT: ; implicit-def: $vgpr37 @@ -31722,21 +32251,21 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GFX9-NEXT: v_mov_b32_e32 v22, s16 -; GFX9-NEXT: v_mov_b32_e32 v23, s17 -; GFX9-NEXT: v_mov_b32_e32 v20, s18 -; GFX9-NEXT: v_mov_b32_e32 v21, s19 -; GFX9-NEXT: v_mov_b32_e32 v18, s20 -; GFX9-NEXT: v_mov_b32_e32 v19, s21 -; GFX9-NEXT: v_mov_b32_e32 v14, s22 -; GFX9-NEXT: v_mov_b32_e32 v15, s23 +; GFX9-NEXT: v_mov_b32_e32 v21, s16 +; GFX9-NEXT: v_mov_b32_e32 v22, s17 +; GFX9-NEXT: v_mov_b32_e32 v19, s18 +; GFX9-NEXT: v_mov_b32_e32 v20, s19 +; GFX9-NEXT: v_mov_b32_e32 v17, s20 +; GFX9-NEXT: v_mov_b32_e32 v18, s21 +; GFX9-NEXT: v_mov_b32_e32 v11, s22 +; GFX9-NEXT: v_mov_b32_e32 v12, s23 ; GFX9-NEXT: v_mov_b32_e32 v9, s24 ; GFX9-NEXT: v_mov_b32_e32 v10, s25 -; GFX9-NEXT: v_mov_b32_e32 v16, s26 -; GFX9-NEXT: v_mov_b32_e32 v17, s27 +; GFX9-NEXT: v_mov_b32_e32 v15, s26 +; GFX9-NEXT: v_mov_b32_e32 v16, s27 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_mov_b32_e32 v12, s28 -; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v13, s28 +; GFX9-NEXT: v_mov_b32_e32 v14, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB53_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 @@ -31747,33 +32276,33 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v21 ; GFX9-NEXT: s_cbranch_execnz .LBB53_3 ; GFX9-NEXT: .LBB53_2: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; GFX9-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v6 ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v5 @@ -31782,62 +32311,62 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v21 ; GFX9-NEXT: .LBB53_3: ; %end -; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GFX9-NEXT: v_lshl_or_b32 v24, v24, 16, v22 -; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v23 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v22 -; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v14 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v15 -; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v14 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v17 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v11, v11, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v24, v24, 16, v21 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v14 ; GFX9-NEXT: v_lshl_or_b32 v14, v37, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX9-NEXT: v_lshl_or_b32 v15, v36, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; GFX9-NEXT: v_lshl_or_b32 v8, v8, 16, v9 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v10 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v16 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v15, v36, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v23, v23, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v16 ; GFX9-NEXT: v_lshl_or_b32 v16, v35, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v18 -; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v17 ; GFX9-NEXT: v_lshl_or_b32 v17, v34, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v19 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v20 ; GFX9-NEXT: v_lshl_or_b32 v18, v33, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v20 -; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v19 ; GFX9-NEXT: v_lshl_or_b32 v19, v32, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v20 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v21 ; GFX9-NEXT: v_lshl_or_b32 v20, v31, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 -; GFX9-NEXT: v_lshl_or_b32 v9, v49, 16, v9 -; GFX9-NEXT: v_lshl_or_b32 v10, v48, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v9, v50, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v49, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v48, 16, v11 ; GFX9-NEXT: v_lshl_or_b32 v12, v39, 16, v12 ; GFX9-NEXT: v_lshl_or_b32 v13, v38, 16, v13 ; GFX9-NEXT: v_lshl_or_b32 v21, v30, 16, v0 @@ -31858,11 +32387,11 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr23 ; GFX9-NEXT: ; implicit-def: $vgpr8 +; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr37 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll index 9ec3f5c00ee23..530ff4f30fd05 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll @@ -3190,13 +3190,41 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v12, s30, 0 -; SI-NEXT: v_writelane_b32 v12, s31, 1 +; SI-NEXT: v_writelane_b32 v20, s30, 0 +; SI-NEXT: v_mov_b32_e32 v12, s16 +; SI-NEXT: v_mov_b32_e32 v13, s17 +; SI-NEXT: v_mov_b32_e32 v14, s18 +; SI-NEXT: v_mov_b32_e32 v15, s19 +; SI-NEXT: v_mov_b32_e32 v16, s20 +; SI-NEXT: v_mov_b32_e32 v17, s21 +; SI-NEXT: v_writelane_b32 v20, s31, 1 +; SI-NEXT: v_mov_b32_e32 v18, s22 +; SI-NEXT: v_mov_b32_e32 v19, s23 +; SI-NEXT: v_readfirstlane_b32 s40, v12 +; SI-NEXT: v_mov_b32_e32 v12, s24 +; SI-NEXT: v_readfirstlane_b32 s41, v13 +; SI-NEXT: v_mov_b32_e32 v13, s25 +; SI-NEXT: v_readfirstlane_b32 s24, v14 +; SI-NEXT: v_mov_b32_e32 v14, s26 +; SI-NEXT: v_readfirstlane_b32 s25, v15 +; SI-NEXT: v_mov_b32_e32 v15, s27 +; SI-NEXT: v_readfirstlane_b32 s22, v16 +; SI-NEXT: v_mov_b32_e32 v16, s28 +; SI-NEXT: v_readfirstlane_b32 s23, v17 +; SI-NEXT: v_mov_b32_e32 v17, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; SI-NEXT: v_writelane_b32 v12, s34, 2 +; SI-NEXT: v_writelane_b32 v20, s34, 2 +; SI-NEXT: v_readfirstlane_b32 s20, v18 +; SI-NEXT: v_readfirstlane_b32 s21, v19 +; SI-NEXT: v_readfirstlane_b32 s18, v12 +; SI-NEXT: v_readfirstlane_b32 s19, v13 +; SI-NEXT: v_readfirstlane_b32 s16, v14 +; SI-NEXT: v_readfirstlane_b32 s17, v15 +; SI-NEXT: v_readfirstlane_b32 s14, v16 +; SI-NEXT: v_readfirstlane_b32 s15, v17 ; SI-NEXT: v_readfirstlane_b32 s12, v1 ; SI-NEXT: v_readfirstlane_b32 s13, v2 ; SI-NEXT: v_readfirstlane_b32 s10, v3 @@ -3206,9 +3234,9 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3 ; SI-NEXT: v_readfirstlane_b32 s6, v7 ; SI-NEXT: v_readfirstlane_b32 s7, v8 ; SI-NEXT: v_readfirstlane_b32 s4, v9 -; SI-NEXT: s_and_b64 s[14:15], vcc, exec +; SI-NEXT: s_and_b64 s[26:27], vcc, exec ; SI-NEXT: v_readfirstlane_b32 s5, v10 -; SI-NEXT: v_writelane_b32 v12, s35, 3 +; SI-NEXT: v_writelane_b32 v20, s35, 3 ; SI-NEXT: s_cbranch_scc0 .LBB13_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s88, s5, 16 @@ -3216,41 +3244,41 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3 ; SI-NEXT: s_lshr_b32 s90, s9, 16 ; SI-NEXT: s_lshr_b32 s91, s11, 16 ; SI-NEXT: s_lshr_b32 s92, s13, 16 -; SI-NEXT: s_lshr_b32 s93, s29, 16 -; SI-NEXT: s_lshr_b32 s94, s27, 16 -; SI-NEXT: s_lshr_b32 s95, s25, 16 -; SI-NEXT: s_lshr_b32 s30, s23, 16 -; SI-NEXT: s_lshr_b32 s31, s21, 16 -; SI-NEXT: s_lshr_b32 s34, s19, 16 -; SI-NEXT: s_lshr_b32 s35, s17, 16 -; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[40:41], s[6:7], 16 +; SI-NEXT: s_lshr_b32 s93, s15, 16 +; SI-NEXT: s_lshr_b32 s94, s17, 16 +; SI-NEXT: s_lshr_b32 s95, s19, 16 +; SI-NEXT: s_lshr_b32 s30, s21, 16 +; SI-NEXT: s_lshr_b32 s31, s23, 16 +; SI-NEXT: s_lshr_b32 s34, s25, 16 +; SI-NEXT: s_lshr_b32 s35, s41, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 ; SI-NEXT: s_lshr_b64 s[42:43], s[8:9], 16 ; SI-NEXT: s_lshr_b64 s[44:45], s[10:11], 16 ; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 16 -; SI-NEXT: s_lshr_b64 s[56:57], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[58:59], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[60:61], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[62:63], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[72:73], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[74:75], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[40:41], 16 ; SI-NEXT: s_cbranch_execnz .LBB13_3 ; SI-NEXT: .LBB13_2: ; %cmp.true -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_add_i32 s40, s40, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 ; SI-NEXT: s_add_i32 s13, s13, 3 ; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_add_i32 s11, s11, 3 @@ -3261,128 +3289,128 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3 ; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s5, s5, 3 ; SI-NEXT: s_add_i32 s4, s4, 3 -; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[40:41], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 ; SI-NEXT: s_lshr_b64 s[42:43], s[8:9], 16 ; SI-NEXT: s_lshr_b64 s[44:45], s[10:11], 16 ; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 16 -; SI-NEXT: s_lshr_b64 s[56:57], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[58:59], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[60:61], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[18:19], 16 ; SI-NEXT: s_lshr_b32 s88, s5, 16 ; SI-NEXT: s_lshr_b32 s89, s7, 16 ; SI-NEXT: s_lshr_b32 s90, s9, 16 ; SI-NEXT: s_lshr_b32 s91, s11, 16 ; SI-NEXT: s_lshr_b32 s92, s13, 16 -; SI-NEXT: s_lshr_b32 s93, s29, 16 -; SI-NEXT: s_lshr_b32 s94, s27, 16 -; SI-NEXT: s_lshr_b32 s95, s25, 16 -; SI-NEXT: s_lshr_b32 s30, s23, 16 -; SI-NEXT: s_lshr_b32 s31, s21, 16 -; SI-NEXT: s_lshr_b32 s34, s19, 16 -; SI-NEXT: s_lshr_b32 s35, s17, 16 -; SI-NEXT: s_lshr_b64 s[62:63], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[72:73], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[74:75], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s93, s15, 16 +; SI-NEXT: s_lshr_b32 s94, s17, 16 +; SI-NEXT: s_lshr_b32 s95, s19, 16 +; SI-NEXT: s_lshr_b32 s30, s21, 16 +; SI-NEXT: s_lshr_b32 s31, s23, 16 +; SI-NEXT: s_lshr_b32 s34, s25, 16 +; SI-NEXT: s_lshr_b32 s35, s41, 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[40:41], 16 ; SI-NEXT: .LBB13_3: ; %end -; SI-NEXT: s_lshl_b32 s15, s76, 16 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s15, s16, s15 -; SI-NEXT: v_mov_b32_e32 v1, s15 -; SI-NEXT: s_and_b32 s15, s17, 0xffff -; SI-NEXT: s_lshl_b32 s16, s35, 16 -; SI-NEXT: s_or_b32 s15, s15, s16 -; SI-NEXT: v_mov_b32_e32 v2, s15 -; SI-NEXT: s_lshl_b32 s15, s74, 16 -; SI-NEXT: s_and_b32 s16, s18, 0xffff -; SI-NEXT: s_or_b32 s15, s16, s15 -; SI-NEXT: v_mov_b32_e32 v3, s15 -; SI-NEXT: s_and_b32 s15, s19, 0xffff -; SI-NEXT: s_lshl_b32 s16, s34, 16 +; SI-NEXT: s_lshl_b32 s27, s76, 16 +; SI-NEXT: s_and_b32 s29, s40, 0xffff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: v_mov_b32_e32 v1, s27 +; SI-NEXT: s_and_b32 s27, s41, 0xffff +; SI-NEXT: s_lshl_b32 s29, s35, 16 +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_lshl_b32 s27, s74, 16 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_or_b32 s24, s24, s27 +; SI-NEXT: v_mov_b32_e32 v3, s24 +; SI-NEXT: s_and_b32 s24, s25, 0xffff +; SI-NEXT: s_lshl_b32 s25, s34, 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: s_or_b32 s24, s24, s25 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s15 -; SI-NEXT: s_and_b32 s15, s20, 0xffff -; SI-NEXT: s_lshl_b32 s16, s72, 16 +; SI-NEXT: v_mov_b32_e32 v2, s24 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_lshl_b32 s24, s72, 16 ; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: s_or_b32 s22, s22, s24 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s15 -; SI-NEXT: s_and_b32 s15, s21, 0xffff -; SI-NEXT: s_lshl_b32 s16, s31, 16 +; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: s_and_b32 s22, s23, 0xffff +; SI-NEXT: s_lshl_b32 s23, s31, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: s_or_b32 s22, s22, s23 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s15 -; SI-NEXT: s_and_b32 s15, s22, 0xffff -; SI-NEXT: s_lshl_b32 s16, s62, 16 +; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_lshl_b32 s22, s62, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: s_or_b32 s20, s20, s22 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s15 -; SI-NEXT: s_and_b32 s15, s23, 0xffff -; SI-NEXT: s_lshl_b32 s16, s30, 16 +; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_and_b32 s20, s21, 0xffff +; SI-NEXT: s_lshl_b32 s21, s30, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: s_or_b32 s20, s20, s21 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s15 -; SI-NEXT: s_and_b32 s15, s24, 0xffff -; SI-NEXT: s_lshl_b32 s16, s60, 16 +; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s20, s60, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: s_or_b32 s18, s18, s20 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s15 -; SI-NEXT: s_and_b32 s15, s25, 0xffff -; SI-NEXT: s_lshl_b32 s16, s95, 16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_and_b32 s18, s19, 0xffff +; SI-NEXT: s_lshl_b32 s19, s95, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s15 -; SI-NEXT: s_and_b32 s15, s26, 0xffff -; SI-NEXT: s_lshl_b32 s16, s58, 16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s18, s58, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: s_or_b32 s16, s16, s18 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s15 -; SI-NEXT: s_and_b32 s15, s27, 0xffff -; SI-NEXT: s_lshl_b32 s16, s94, 16 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s17, 0xffff +; SI-NEXT: s_lshl_b32 s17, s94, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 -; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s15 -; SI-NEXT: s_and_b32 s15, s28, 0xffff +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: s_lshl_b32 s16, s56, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: s_or_b32 s14, s14, s16 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s15 -; SI-NEXT: s_and_b32 s15, s29, 0xffff -; SI-NEXT: s_lshl_b32 s16, s93, 16 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_and_b32 s14, s15, 0xffff +; SI-NEXT: s_lshl_b32 s15, s93, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s15 +; SI-NEXT: v_mov_b32_e32 v2, s14 ; SI-NEXT: s_and_b32 s12, s12, 0xffff -; SI-NEXT: s_lshl_b32 s15, s46, 16 +; SI-NEXT: s_lshl_b32 s14, s46, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 -; SI-NEXT: s_or_b32 s12, s12, s15 +; SI-NEXT: s_or_b32 s12, s12, s14 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s12 @@ -3422,7 +3450,7 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_lshl_b32 s8, s40, 16 +; SI-NEXT: s_lshl_b32 s8, s28, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x4c, v0 ; SI-NEXT: s_or_b32 s6, s6, s8 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -3436,7 +3464,7 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s6, s14, 16 +; SI-NEXT: s_lshl_b32 s6, s26, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x54, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -3450,12 +3478,12 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: v_readlane_b32 s35, v12, 3 -; SI-NEXT: v_readlane_b32 s34, v12, 2 -; SI-NEXT: v_readlane_b32 s31, v12, 1 -; SI-NEXT: v_readlane_b32 s30, v12, 0 +; SI-NEXT: v_readlane_b32 s35, v20, 3 +; SI-NEXT: v_readlane_b32 s34, v20, 2 +; SI-NEXT: v_readlane_b32 s31, v20, 1 +; SI-NEXT: v_readlane_b32 s30, v20, 0 ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3480,16 +3508,44 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr91 ; SI-NEXT: ; implicit-def: $sgpr42 ; SI-NEXT: ; implicit-def: $sgpr90 -; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr89 -; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr88 ; SI-NEXT: s_branch .LBB13_2 ; ; VI-LABEL: bitcast_v24i32_to_v48i16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v11, s16 +; VI-NEXT: v_mov_b32_e32 v12, s17 +; VI-NEXT: v_mov_b32_e32 v13, s18 +; VI-NEXT: v_mov_b32_e32 v14, s19 +; VI-NEXT: v_mov_b32_e32 v15, s20 +; VI-NEXT: v_mov_b32_e32 v16, s21 +; VI-NEXT: v_mov_b32_e32 v17, s22 +; VI-NEXT: v_mov_b32_e32 v18, s23 +; VI-NEXT: v_mov_b32_e32 v19, s24 +; VI-NEXT: v_readfirstlane_b32 s42, v11 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_readfirstlane_b32 s41, v12 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_readfirstlane_b32 s40, v13 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_readfirstlane_b32 s26, v14 +; VI-NEXT: v_mov_b32_e32 v14, s28 +; VI-NEXT: v_readfirstlane_b32 s25, v15 +; VI-NEXT: v_mov_b32_e32 v15, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_readfirstlane_b32 s24, v16 +; VI-NEXT: v_readfirstlane_b32 s23, v17 +; VI-NEXT: v_readfirstlane_b32 s22, v18 +; VI-NEXT: v_readfirstlane_b32 s21, v19 +; VI-NEXT: v_readfirstlane_b32 s20, v11 +; VI-NEXT: v_readfirstlane_b32 s19, v12 +; VI-NEXT: v_readfirstlane_b32 s18, v13 +; VI-NEXT: v_readfirstlane_b32 s17, v14 +; VI-NEXT: v_readfirstlane_b32 s16, v15 ; VI-NEXT: v_readfirstlane_b32 s15, v0 ; VI-NEXT: v_readfirstlane_b32 s14, v1 ; VI-NEXT: v_readfirstlane_b32 s13, v2 @@ -3503,9 +3559,9 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3 ; VI-NEXT: v_readfirstlane_b32 s7, v9 ; VI-NEXT: s_cbranch_scc0 .LBB13_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s40, s7, 16 -; VI-NEXT: s_lshr_b32 s41, s6, 16 -; VI-NEXT: s_lshr_b32 s42, s8, 16 +; VI-NEXT: s_lshr_b32 s27, s7, 16 +; VI-NEXT: s_lshr_b32 s28, s6, 16 +; VI-NEXT: s_lshr_b32 s29, s8, 16 ; VI-NEXT: s_lshr_b32 s43, s9, 16 ; VI-NEXT: s_lshr_b32 s44, s10, 16 ; VI-NEXT: s_lshr_b32 s45, s11, 16 @@ -3513,20 +3569,20 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3 ; VI-NEXT: s_lshr_b32 s47, s13, 16 ; VI-NEXT: s_lshr_b32 s56, s14, 16 ; VI-NEXT: s_lshr_b32 s57, s15, 16 -; VI-NEXT: s_lshr_b32 s58, s29, 16 -; VI-NEXT: s_lshr_b32 s59, s28, 16 -; VI-NEXT: s_lshr_b32 s60, s27, 16 -; VI-NEXT: s_lshr_b32 s61, s26, 16 -; VI-NEXT: s_lshr_b32 s62, s25, 16 -; VI-NEXT: s_lshr_b32 s63, s24, 16 -; VI-NEXT: s_lshr_b32 s72, s23, 16 -; VI-NEXT: s_lshr_b32 s73, s22, 16 -; VI-NEXT: s_lshr_b32 s74, s21, 16 -; VI-NEXT: s_lshr_b32 s75, s20, 16 -; VI-NEXT: s_lshr_b32 s76, s19, 16 -; VI-NEXT: s_lshr_b32 s77, s18, 16 -; VI-NEXT: s_lshr_b32 s78, s17, 16 -; VI-NEXT: s_lshr_b32 s79, s16, 16 +; VI-NEXT: s_lshr_b32 s58, s16, 16 +; VI-NEXT: s_lshr_b32 s59, s17, 16 +; VI-NEXT: s_lshr_b32 s60, s18, 16 +; VI-NEXT: s_lshr_b32 s61, s19, 16 +; VI-NEXT: s_lshr_b32 s62, s20, 16 +; VI-NEXT: s_lshr_b32 s63, s21, 16 +; VI-NEXT: s_lshr_b32 s72, s22, 16 +; VI-NEXT: s_lshr_b32 s73, s23, 16 +; VI-NEXT: s_lshr_b32 s74, s24, 16 +; VI-NEXT: s_lshr_b32 s75, s25, 16 +; VI-NEXT: s_lshr_b32 s76, s26, 16 +; VI-NEXT: s_lshr_b32 s77, s40, 16 +; VI-NEXT: s_lshr_b32 s78, s41, 16 +; VI-NEXT: s_lshr_b32 s79, s42, 16 ; VI-NEXT: s_cbranch_execnz .LBB13_3 ; VI-NEXT: .LBB13_2: ; %cmp.true ; VI-NEXT: s_add_i32 s7, s7, 3 @@ -3539,23 +3595,23 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3 ; VI-NEXT: s_add_i32 s13, s13, 3 ; VI-NEXT: s_add_i32 s14, s14, 3 ; VI-NEXT: s_add_i32 s15, s15, 3 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_add_i32 s17, s17, 3 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_lshr_b32 s40, s7, 16 -; VI-NEXT: s_lshr_b32 s41, s6, 16 -; VI-NEXT: s_lshr_b32 s42, s8, 16 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_lshr_b32 s27, s7, 16 +; VI-NEXT: s_lshr_b32 s28, s6, 16 +; VI-NEXT: s_lshr_b32 s29, s8, 16 ; VI-NEXT: s_lshr_b32 s43, s9, 16 ; VI-NEXT: s_lshr_b32 s44, s10, 16 ; VI-NEXT: s_lshr_b32 s45, s11, 16 @@ -3563,107 +3619,107 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3 ; VI-NEXT: s_lshr_b32 s47, s13, 16 ; VI-NEXT: s_lshr_b32 s56, s14, 16 ; VI-NEXT: s_lshr_b32 s57, s15, 16 -; VI-NEXT: s_lshr_b32 s58, s29, 16 -; VI-NEXT: s_lshr_b32 s59, s28, 16 -; VI-NEXT: s_lshr_b32 s60, s27, 16 -; VI-NEXT: s_lshr_b32 s61, s26, 16 -; VI-NEXT: s_lshr_b32 s62, s25, 16 -; VI-NEXT: s_lshr_b32 s63, s24, 16 -; VI-NEXT: s_lshr_b32 s72, s23, 16 -; VI-NEXT: s_lshr_b32 s73, s22, 16 -; VI-NEXT: s_lshr_b32 s74, s21, 16 -; VI-NEXT: s_lshr_b32 s75, s20, 16 -; VI-NEXT: s_lshr_b32 s76, s19, 16 -; VI-NEXT: s_lshr_b32 s77, s18, 16 -; VI-NEXT: s_lshr_b32 s78, s17, 16 -; VI-NEXT: s_lshr_b32 s79, s16, 16 +; VI-NEXT: s_lshr_b32 s58, s16, 16 +; VI-NEXT: s_lshr_b32 s59, s17, 16 +; VI-NEXT: s_lshr_b32 s60, s18, 16 +; VI-NEXT: s_lshr_b32 s61, s19, 16 +; VI-NEXT: s_lshr_b32 s62, s20, 16 +; VI-NEXT: s_lshr_b32 s63, s21, 16 +; VI-NEXT: s_lshr_b32 s72, s22, 16 +; VI-NEXT: s_lshr_b32 s73, s23, 16 +; VI-NEXT: s_lshr_b32 s74, s24, 16 +; VI-NEXT: s_lshr_b32 s75, s25, 16 +; VI-NEXT: s_lshr_b32 s76, s26, 16 +; VI-NEXT: s_lshr_b32 s77, s40, 16 +; VI-NEXT: s_lshr_b32 s78, s41, 16 +; VI-NEXT: s_lshr_b32 s79, s42, 16 ; VI-NEXT: .LBB13_3: ; %end -; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_and_b32 s4, 0xffff, s42 ; VI-NEXT: s_lshl_b32 s5, s79, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s16, s78, 16 -; VI-NEXT: s_or_b32 s5, s5, s16 -; VI-NEXT: s_and_b32 s16, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s17, s77, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_and_b32 s17, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s18, s76, 16 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s18, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s19, s75, 16 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_and_b32 s19, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s20, s74, 16 -; VI-NEXT: s_or_b32 s19, s19, s20 -; VI-NEXT: s_and_b32 s20, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s21, s73, 16 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_and_b32 s21, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s22, s72, 16 -; VI-NEXT: s_or_b32 s21, s21, s22 -; VI-NEXT: s_and_b32 s22, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s23, s63, 16 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_and_b32 s23, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s24, s62, 16 -; VI-NEXT: s_or_b32 s23, s23, s24 -; VI-NEXT: s_and_b32 s24, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s25, s61, 16 -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_and_b32 s25, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s26, s60, 16 -; VI-NEXT: s_or_b32 s25, s25, s26 -; VI-NEXT: s_and_b32 s26, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s27, s59, 16 -; VI-NEXT: s_or_b32 s26, s26, s27 -; VI-NEXT: s_and_b32 s27, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s28, s58, 16 -; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s5, 0xffff, s41 +; VI-NEXT: s_lshl_b32 s41, s78, 16 +; VI-NEXT: s_or_b32 s5, s5, s41 +; VI-NEXT: s_and_b32 s40, 0xffff, s40 +; VI-NEXT: s_lshl_b32 s41, s77, 16 +; VI-NEXT: s_or_b32 s40, s40, s41 +; VI-NEXT: s_and_b32 s26, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s41, s76, 16 +; VI-NEXT: s_or_b32 s26, s26, s41 +; VI-NEXT: s_and_b32 s25, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s41, s75, 16 +; VI-NEXT: s_or_b32 s25, s25, s41 +; VI-NEXT: s_and_b32 s24, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s41, s74, 16 +; VI-NEXT: s_or_b32 s24, s24, s41 +; VI-NEXT: s_and_b32 s23, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s41, s73, 16 +; VI-NEXT: s_or_b32 s23, s23, s41 +; VI-NEXT: s_and_b32 s22, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s41, s72, 16 +; VI-NEXT: s_or_b32 s22, s22, s41 +; VI-NEXT: s_and_b32 s21, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s41, s63, 16 +; VI-NEXT: s_or_b32 s21, s21, s41 +; VI-NEXT: s_and_b32 s20, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s41, s62, 16 +; VI-NEXT: s_or_b32 s20, s20, s41 +; VI-NEXT: s_and_b32 s19, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s41, s61, 16 +; VI-NEXT: s_or_b32 s19, s19, s41 +; VI-NEXT: s_and_b32 s18, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s41, s60, 16 +; VI-NEXT: s_or_b32 s18, s18, s41 +; VI-NEXT: s_and_b32 s17, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s41, s59, 16 +; VI-NEXT: s_or_b32 s17, s17, s41 +; VI-NEXT: s_and_b32 s16, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s41, s58, 16 +; VI-NEXT: s_or_b32 s16, s16, s41 ; VI-NEXT: s_and_b32 s15, 0xffff, s15 -; VI-NEXT: s_lshl_b32 s28, s57, 16 -; VI-NEXT: s_or_b32 s15, s15, s28 +; VI-NEXT: s_lshl_b32 s41, s57, 16 +; VI-NEXT: s_or_b32 s15, s15, s41 ; VI-NEXT: s_and_b32 s14, 0xffff, s14 -; VI-NEXT: s_lshl_b32 s28, s56, 16 -; VI-NEXT: s_or_b32 s14, s14, s28 +; VI-NEXT: s_lshl_b32 s41, s56, 16 +; VI-NEXT: s_or_b32 s14, s14, s41 ; VI-NEXT: s_and_b32 s13, 0xffff, s13 -; VI-NEXT: s_lshl_b32 s28, s47, 16 -; VI-NEXT: s_or_b32 s13, s13, s28 +; VI-NEXT: s_lshl_b32 s41, s47, 16 +; VI-NEXT: s_or_b32 s13, s13, s41 ; VI-NEXT: s_and_b32 s12, 0xffff, s12 -; VI-NEXT: s_lshl_b32 s28, s46, 16 -; VI-NEXT: s_or_b32 s12, s12, s28 +; VI-NEXT: s_lshl_b32 s41, s46, 16 +; VI-NEXT: s_or_b32 s12, s12, s41 ; VI-NEXT: s_and_b32 s11, 0xffff, s11 -; VI-NEXT: s_lshl_b32 s28, s45, 16 -; VI-NEXT: s_or_b32 s11, s11, s28 +; VI-NEXT: s_lshl_b32 s41, s45, 16 +; VI-NEXT: s_or_b32 s11, s11, s41 ; VI-NEXT: s_and_b32 s10, 0xffff, s10 -; VI-NEXT: s_lshl_b32 s28, s44, 16 -; VI-NEXT: s_or_b32 s10, s10, s28 +; VI-NEXT: s_lshl_b32 s41, s44, 16 +; VI-NEXT: s_or_b32 s10, s10, s41 ; VI-NEXT: s_and_b32 s9, 0xffff, s9 -; VI-NEXT: s_lshl_b32 s28, s43, 16 -; VI-NEXT: s_or_b32 s9, s9, s28 +; VI-NEXT: s_lshl_b32 s41, s43, 16 ; VI-NEXT: s_and_b32 s8, 0xffff, s8 -; VI-NEXT: s_lshl_b32 s28, s42, 16 -; VI-NEXT: s_or_b32 s8, s8, s28 +; VI-NEXT: s_lshl_b32 s29, s29, 16 ; VI-NEXT: s_and_b32 s6, 0xffff, s6 -; VI-NEXT: s_lshl_b32 s28, s41, 16 -; VI-NEXT: s_or_b32 s6, s6, s28 +; VI-NEXT: s_lshl_b32 s28, s28, 16 ; VI-NEXT: s_and_b32 s7, 0xffff, s7 -; VI-NEXT: s_lshl_b32 s28, s40, 16 -; VI-NEXT: s_or_b32 s7, s7, s28 +; VI-NEXT: s_lshl_b32 s27, s27, 16 +; VI-NEXT: s_or_b32 s9, s9, s41 +; VI-NEXT: s_or_b32 s8, s8, s29 +; VI-NEXT: s_or_b32 s6, s6, s28 +; VI-NEXT: s_or_b32 s7, s7, s27 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: v_mov_b32_e32 v3, s17 -; VI-NEXT: v_mov_b32_e32 v4, s18 -; VI-NEXT: v_mov_b32_e32 v5, s19 -; VI-NEXT: v_mov_b32_e32 v6, s20 -; VI-NEXT: v_mov_b32_e32 v7, s21 -; VI-NEXT: v_mov_b32_e32 v8, s22 -; VI-NEXT: v_mov_b32_e32 v9, s23 -; VI-NEXT: v_mov_b32_e32 v10, s24 -; VI-NEXT: v_mov_b32_e32 v11, s25 -; VI-NEXT: v_mov_b32_e32 v12, s26 -; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v2, s40 +; VI-NEXT: v_mov_b32_e32 v3, s26 +; VI-NEXT: v_mov_b32_e32 v4, s25 +; VI-NEXT: v_mov_b32_e32 v5, s24 +; VI-NEXT: v_mov_b32_e32 v6, s23 +; VI-NEXT: v_mov_b32_e32 v7, s22 +; VI-NEXT: v_mov_b32_e32 v8, s21 +; VI-NEXT: v_mov_b32_e32 v9, s20 +; VI-NEXT: v_mov_b32_e32 v10, s19 +; VI-NEXT: v_mov_b32_e32 v11, s18 +; VI-NEXT: v_mov_b32_e32 v12, s17 +; VI-NEXT: v_mov_b32_e32 v13, s16 ; VI-NEXT: v_mov_b32_e32 v14, s15 ; VI-NEXT: v_mov_b32_e32 v15, s14 ; VI-NEXT: v_mov_b32_e32 v16, s13 @@ -3697,64 +3753,82 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3 ; VI-NEXT: ; implicit-def: $sgpr45 ; VI-NEXT: ; implicit-def: $sgpr44 ; VI-NEXT: ; implicit-def: $sgpr43 -; VI-NEXT: ; implicit-def: $sgpr42 -; VI-NEXT: ; implicit-def: $sgpr41 -; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 ; VI-NEXT: s_branch .LBB13_2 ; ; GFX9-LABEL: bitcast_v24i32_to_v48i16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v11, s16 +; GFX9-NEXT: v_mov_b32_e32 v12, s17 +; GFX9-NEXT: v_mov_b32_e32 v13, s18 +; GFX9-NEXT: v_mov_b32_e32 v14, s19 +; GFX9-NEXT: v_mov_b32_e32 v15, s20 +; GFX9-NEXT: v_mov_b32_e32 v16, s21 +; GFX9-NEXT: v_mov_b32_e32 v17, s22 +; GFX9-NEXT: v_mov_b32_e32 v18, s23 +; GFX9-NEXT: v_mov_b32_e32 v19, s24 +; GFX9-NEXT: v_readfirstlane_b32 s6, v11 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_readfirstlane_b32 s7, v12 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_readfirstlane_b32 s8, v13 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_readfirstlane_b32 s9, v14 +; GFX9-NEXT: v_mov_b32_e32 v14, s28 +; GFX9-NEXT: v_readfirstlane_b32 s10, v15 +; GFX9-NEXT: v_mov_b32_e32 v15, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: v_readfirstlane_b32 s7, v1 -; GFX9-NEXT: v_readfirstlane_b32 s8, v2 -; GFX9-NEXT: v_readfirstlane_b32 s9, v3 -; GFX9-NEXT: v_readfirstlane_b32 s10, v4 -; GFX9-NEXT: v_readfirstlane_b32 s11, v5 -; GFX9-NEXT: v_readfirstlane_b32 s12, v6 -; GFX9-NEXT: v_readfirstlane_b32 s13, v7 -; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: v_readfirstlane_b32 s11, v16 +; GFX9-NEXT: v_readfirstlane_b32 s12, v17 +; GFX9-NEXT: v_readfirstlane_b32 s13, v18 +; GFX9-NEXT: v_readfirstlane_b32 s14, v19 +; GFX9-NEXT: v_readfirstlane_b32 s15, v11 +; GFX9-NEXT: v_readfirstlane_b32 s16, v12 +; GFX9-NEXT: v_readfirstlane_b32 s17, v13 +; GFX9-NEXT: v_readfirstlane_b32 s18, v14 +; GFX9-NEXT: v_readfirstlane_b32 s19, v15 +; GFX9-NEXT: v_readfirstlane_b32 s20, v0 +; GFX9-NEXT: v_readfirstlane_b32 s21, v1 +; GFX9-NEXT: v_readfirstlane_b32 s22, v2 +; GFX9-NEXT: v_readfirstlane_b32 s23, v3 +; GFX9-NEXT: v_readfirstlane_b32 s24, v4 +; GFX9-NEXT: v_readfirstlane_b32 s25, v5 +; GFX9-NEXT: v_readfirstlane_b32 s26, v6 +; GFX9-NEXT: v_readfirstlane_b32 s27, v7 +; GFX9-NEXT: v_readfirstlane_b32 s28, v8 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s29, v9 ; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_lshr_b32 s40, s15, 16 -; GFX9-NEXT: s_lshr_b32 s41, s14, 16 -; GFX9-NEXT: s_lshr_b32 s42, s13, 16 -; GFX9-NEXT: s_lshr_b32 s43, s12, 16 -; GFX9-NEXT: s_lshr_b32 s44, s11, 16 -; GFX9-NEXT: s_lshr_b32 s45, s10, 16 -; GFX9-NEXT: s_lshr_b32 s46, s9, 16 -; GFX9-NEXT: s_lshr_b32 s47, s8, 16 -; GFX9-NEXT: s_lshr_b32 s56, s7, 16 -; GFX9-NEXT: s_lshr_b32 s57, s6, 16 -; GFX9-NEXT: s_lshr_b32 s58, s29, 16 -; GFX9-NEXT: s_lshr_b32 s59, s28, 16 -; GFX9-NEXT: s_lshr_b32 s60, s27, 16 -; GFX9-NEXT: s_lshr_b32 s61, s26, 16 -; GFX9-NEXT: s_lshr_b32 s62, s25, 16 -; GFX9-NEXT: s_lshr_b32 s63, s24, 16 -; GFX9-NEXT: s_lshr_b32 s72, s23, 16 -; GFX9-NEXT: s_lshr_b32 s73, s22, 16 -; GFX9-NEXT: s_lshr_b32 s74, s21, 16 -; GFX9-NEXT: s_lshr_b32 s75, s20, 16 -; GFX9-NEXT: s_lshr_b32 s76, s19, 16 -; GFX9-NEXT: s_lshr_b32 s77, s18, 16 -; GFX9-NEXT: s_lshr_b32 s78, s17, 16 -; GFX9-NEXT: s_lshr_b32 s79, s16, 16 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s44, s25, 16 +; GFX9-NEXT: s_lshr_b32 s45, s24, 16 +; GFX9-NEXT: s_lshr_b32 s46, s23, 16 +; GFX9-NEXT: s_lshr_b32 s47, s22, 16 +; GFX9-NEXT: s_lshr_b32 s56, s21, 16 +; GFX9-NEXT: s_lshr_b32 s57, s20, 16 +; GFX9-NEXT: s_lshr_b32 s58, s19, 16 +; GFX9-NEXT: s_lshr_b32 s59, s18, 16 +; GFX9-NEXT: s_lshr_b32 s60, s17, 16 +; GFX9-NEXT: s_lshr_b32 s61, s16, 16 +; GFX9-NEXT: s_lshr_b32 s62, s15, 16 +; GFX9-NEXT: s_lshr_b32 s63, s14, 16 +; GFX9-NEXT: s_lshr_b32 s72, s13, 16 +; GFX9-NEXT: s_lshr_b32 s73, s12, 16 +; GFX9-NEXT: s_lshr_b32 s74, s11, 16 +; GFX9-NEXT: s_lshr_b32 s75, s10, 16 +; GFX9-NEXT: s_lshr_b32 s76, s9, 16 +; GFX9-NEXT: s_lshr_b32 s77, s8, 16 +; GFX9-NEXT: s_lshr_b32 s78, s7, 16 +; GFX9-NEXT: s_lshr_b32 s79, s6, 16 ; GFX9-NEXT: s_cbranch_execnz .LBB13_3 ; GFX9-NEXT: .LBB13_2: ; %cmp.true -; GFX9-NEXT: s_add_i32 s15, s15, 3 -; GFX9-NEXT: s_add_i32 s14, s14, 3 -; GFX9-NEXT: s_add_i32 s13, s13, 3 -; GFX9-NEXT: s_add_i32 s12, s12, 3 -; GFX9-NEXT: s_add_i32 s11, s11, 3 -; GFX9-NEXT: s_add_i32 s10, s10, 3 -; GFX9-NEXT: s_add_i32 s9, s9, 3 -; GFX9-NEXT: s_add_i32 s8, s8, 3 -; GFX9-NEXT: s_add_i32 s7, s7, 3 -; GFX9-NEXT: s_add_i32 s6, s6, 3 ; GFX9-NEXT: s_add_i32 s29, s29, 3 ; GFX9-NEXT: s_add_i32 s28, s28, 3 ; GFX9-NEXT: s_add_i32 s27, s27, 3 @@ -3769,79 +3843,89 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3 ; GFX9-NEXT: s_add_i32 s18, s18, 3 ; GFX9-NEXT: s_add_i32 s17, s17, 3 ; GFX9-NEXT: s_add_i32 s16, s16, 3 -; GFX9-NEXT: s_lshr_b32 s40, s15, 16 -; GFX9-NEXT: s_lshr_b32 s41, s14, 16 -; GFX9-NEXT: s_lshr_b32 s42, s13, 16 -; GFX9-NEXT: s_lshr_b32 s43, s12, 16 -; GFX9-NEXT: s_lshr_b32 s44, s11, 16 -; GFX9-NEXT: s_lshr_b32 s45, s10, 16 -; GFX9-NEXT: s_lshr_b32 s46, s9, 16 -; GFX9-NEXT: s_lshr_b32 s47, s8, 16 -; GFX9-NEXT: s_lshr_b32 s56, s7, 16 -; GFX9-NEXT: s_lshr_b32 s57, s6, 16 -; GFX9-NEXT: s_lshr_b32 s58, s29, 16 -; GFX9-NEXT: s_lshr_b32 s59, s28, 16 -; GFX9-NEXT: s_lshr_b32 s60, s27, 16 -; GFX9-NEXT: s_lshr_b32 s61, s26, 16 -; GFX9-NEXT: s_lshr_b32 s62, s25, 16 -; GFX9-NEXT: s_lshr_b32 s63, s24, 16 -; GFX9-NEXT: s_lshr_b32 s72, s23, 16 -; GFX9-NEXT: s_lshr_b32 s73, s22, 16 -; GFX9-NEXT: s_lshr_b32 s74, s21, 16 -; GFX9-NEXT: s_lshr_b32 s75, s20, 16 -; GFX9-NEXT: s_lshr_b32 s76, s19, 16 -; GFX9-NEXT: s_lshr_b32 s77, s18, 16 -; GFX9-NEXT: s_lshr_b32 s78, s17, 16 -; GFX9-NEXT: s_lshr_b32 s79, s16, 16 +; GFX9-NEXT: s_add_i32 s15, s15, 3 +; GFX9-NEXT: s_add_i32 s14, s14, 3 +; GFX9-NEXT: s_add_i32 s13, s13, 3 +; GFX9-NEXT: s_add_i32 s12, s12, 3 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_add_i32 s10, s10, 3 +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: s_add_i32 s6, s6, 3 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s44, s25, 16 +; GFX9-NEXT: s_lshr_b32 s45, s24, 16 +; GFX9-NEXT: s_lshr_b32 s46, s23, 16 +; GFX9-NEXT: s_lshr_b32 s47, s22, 16 +; GFX9-NEXT: s_lshr_b32 s56, s21, 16 +; GFX9-NEXT: s_lshr_b32 s57, s20, 16 +; GFX9-NEXT: s_lshr_b32 s58, s19, 16 +; GFX9-NEXT: s_lshr_b32 s59, s18, 16 +; GFX9-NEXT: s_lshr_b32 s60, s17, 16 +; GFX9-NEXT: s_lshr_b32 s61, s16, 16 +; GFX9-NEXT: s_lshr_b32 s62, s15, 16 +; GFX9-NEXT: s_lshr_b32 s63, s14, 16 +; GFX9-NEXT: s_lshr_b32 s72, s13, 16 +; GFX9-NEXT: s_lshr_b32 s73, s12, 16 +; GFX9-NEXT: s_lshr_b32 s74, s11, 16 +; GFX9-NEXT: s_lshr_b32 s75, s10, 16 +; GFX9-NEXT: s_lshr_b32 s76, s9, 16 +; GFX9-NEXT: s_lshr_b32 s77, s8, 16 +; GFX9-NEXT: s_lshr_b32 s78, s7, 16 +; GFX9-NEXT: s_lshr_b32 s79, s6, 16 ; GFX9-NEXT: .LBB13_3: ; %end -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s79 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s78 -; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s77 -; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s76 -; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s75 -; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s74 -; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s73 -; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s72 -; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s63 -; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s62 -; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s61 -; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s60 -; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s59 -; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s58 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s57 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s56 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s47 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s46 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s45 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s44 -; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s43 -; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s42 -; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s41 -; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s40 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s7, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s9, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s10, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s11, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s12, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s13, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s14, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s15, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s16, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s17, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s40 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s18 -; GFX9-NEXT: v_mov_b32_e32 v5, s19 -; GFX9-NEXT: v_mov_b32_e32 v6, s20 -; GFX9-NEXT: v_mov_b32_e32 v7, s21 -; GFX9-NEXT: v_mov_b32_e32 v8, s22 -; GFX9-NEXT: v_mov_b32_e32 v9, s23 -; GFX9-NEXT: v_mov_b32_e32 v10, s24 -; GFX9-NEXT: v_mov_b32_e32 v11, s25 -; GFX9-NEXT: v_mov_b32_e32 v12, s26 -; GFX9-NEXT: v_mov_b32_e32 v13, s27 -; GFX9-NEXT: v_mov_b32_e32 v14, s6 -; GFX9-NEXT: v_mov_b32_e32 v15, s7 -; GFX9-NEXT: v_mov_b32_e32 v16, s8 -; GFX9-NEXT: v_mov_b32_e32 v17, s9 -; GFX9-NEXT: v_mov_b32_e32 v18, s10 -; GFX9-NEXT: v_mov_b32_e32 v19, s11 -; GFX9-NEXT: v_mov_b32_e32 v20, s12 -; GFX9-NEXT: v_mov_b32_e32 v21, s13 -; GFX9-NEXT: v_mov_b32_e32 v22, s14 -; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-NEXT: v_mov_b32_e32 v12, s16 +; GFX9-NEXT: v_mov_b32_e32 v13, s17 +; GFX9-NEXT: v_mov_b32_e32 v14, s18 +; GFX9-NEXT: v_mov_b32_e32 v15, s19 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v18, s22 +; GFX9-NEXT: v_mov_b32_e32 v19, s23 +; GFX9-NEXT: v_mov_b32_e32 v20, s24 +; GFX9-NEXT: v_mov_b32_e32 v21, s25 +; GFX9-NEXT: v_mov_b32_e32 v22, s26 +; GFX9-NEXT: v_mov_b32_e32 v23, s27 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB13_4: ; GFX9-NEXT: ; implicit-def: $sgpr79 @@ -3873,37 +3957,64 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3 ; GFX11-LABEL: bitcast_v24i32_to_v48i16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v7, s0 :: v_dual_mov_b32 v8, s1 +; GFX11-NEXT: v_dual_mov_b32 v9, s2 :: v_dual_mov_b32 v10, s3 +; GFX11-NEXT: v_dual_mov_b32 v11, s16 :: v_dual_mov_b32 v12, s17 +; GFX11-NEXT: v_dual_mov_b32 v13, s18 :: v_dual_mov_b32 v14, s19 +; GFX11-NEXT: v_dual_mov_b32 v15, s20 :: v_dual_mov_b32 v16, s21 +; GFX11-NEXT: v_dual_mov_b32 v17, s22 :: v_dual_mov_b32 v18, s23 +; GFX11-NEXT: v_dual_mov_b32 v19, s24 :: v_dual_mov_b32 v20, s25 +; GFX11-NEXT: v_dual_mov_b32 v21, s26 :: v_dual_mov_b32 v22, s27 +; GFX11-NEXT: v_dual_mov_b32 v23, s28 :: v_dual_mov_b32 v24, s29 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: v_readfirstlane_b32 s9, v4 -; GFX11-NEXT: v_readfirstlane_b32 s8, v5 +; GFX11-NEXT: v_readfirstlane_b32 s0, v7 +; GFX11-NEXT: v_readfirstlane_b32 s1, v8 +; GFX11-NEXT: v_readfirstlane_b32 s2, v9 +; GFX11-NEXT: v_readfirstlane_b32 s3, v10 +; GFX11-NEXT: v_readfirstlane_b32 s4, v11 +; GFX11-NEXT: v_readfirstlane_b32 s5, v12 +; GFX11-NEXT: v_readfirstlane_b32 s6, v13 +; GFX11-NEXT: v_readfirstlane_b32 s7, v14 +; GFX11-NEXT: v_readfirstlane_b32 s8, v15 +; GFX11-NEXT: v_readfirstlane_b32 s9, v16 +; GFX11-NEXT: v_readfirstlane_b32 s10, v17 +; GFX11-NEXT: v_readfirstlane_b32 s11, v18 +; GFX11-NEXT: v_readfirstlane_b32 s12, v19 +; GFX11-NEXT: v_readfirstlane_b32 s13, v20 +; GFX11-NEXT: v_readfirstlane_b32 s14, v21 +; GFX11-NEXT: v_readfirstlane_b32 s15, v22 +; GFX11-NEXT: v_readfirstlane_b32 s16, v23 +; GFX11-NEXT: v_readfirstlane_b32 s17, v24 +; GFX11-NEXT: v_readfirstlane_b32 s18, v0 +; GFX11-NEXT: v_readfirstlane_b32 s19, v1 +; GFX11-NEXT: v_readfirstlane_b32 s20, v2 +; GFX11-NEXT: v_readfirstlane_b32 s21, v3 +; GFX11-NEXT: v_readfirstlane_b32 s23, v4 +; GFX11-NEXT: v_readfirstlane_b32 s22, v5 ; GFX11-NEXT: s_mov_b32 s74, 0 -; GFX11-NEXT: s_and_b32 s10, vcc_lo, exec_lo +; GFX11-NEXT: s_and_b32 s24, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB13_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: s_lshr_b32 s10, s8, 16 -; GFX11-NEXT: s_lshr_b32 s11, s9, 16 -; GFX11-NEXT: s_lshr_b32 s12, s7, 16 -; GFX11-NEXT: s_lshr_b32 s13, s6, 16 -; GFX11-NEXT: s_lshr_b32 s14, s5, 16 -; GFX11-NEXT: s_lshr_b32 s15, s4, 16 -; GFX11-NEXT: s_lshr_b32 s40, s29, 16 -; GFX11-NEXT: s_lshr_b32 s41, s28, 16 -; GFX11-NEXT: s_lshr_b32 s42, s27, 16 -; GFX11-NEXT: s_lshr_b32 s43, s26, 16 -; GFX11-NEXT: s_lshr_b32 s44, s25, 16 -; GFX11-NEXT: s_lshr_b32 s45, s24, 16 -; GFX11-NEXT: s_lshr_b32 s46, s23, 16 -; GFX11-NEXT: s_lshr_b32 s47, s22, 16 -; GFX11-NEXT: s_lshr_b32 s56, s21, 16 -; GFX11-NEXT: s_lshr_b32 s57, s20, 16 -; GFX11-NEXT: s_lshr_b32 s58, s19, 16 -; GFX11-NEXT: s_lshr_b32 s59, s18, 16 -; GFX11-NEXT: s_lshr_b32 s60, s17, 16 -; GFX11-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-NEXT: s_lshr_b32 s24, s22, 16 +; GFX11-NEXT: s_lshr_b32 s25, s23, 16 +; GFX11-NEXT: s_lshr_b32 s26, s21, 16 +; GFX11-NEXT: s_lshr_b32 s27, s20, 16 +; GFX11-NEXT: s_lshr_b32 s28, s19, 16 +; GFX11-NEXT: s_lshr_b32 s29, s18, 16 +; GFX11-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-NEXT: s_lshr_b32 s42, s15, 16 +; GFX11-NEXT: s_lshr_b32 s43, s14, 16 +; GFX11-NEXT: s_lshr_b32 s44, s13, 16 +; GFX11-NEXT: s_lshr_b32 s45, s12, 16 +; GFX11-NEXT: s_lshr_b32 s46, s11, 16 +; GFX11-NEXT: s_lshr_b32 s47, s10, 16 +; GFX11-NEXT: s_lshr_b32 s56, s9, 16 +; GFX11-NEXT: s_lshr_b32 s57, s8, 16 +; GFX11-NEXT: s_lshr_b32 s58, s7, 16 +; GFX11-NEXT: s_lshr_b32 s59, s6, 16 +; GFX11-NEXT: s_lshr_b32 s60, s5, 16 +; GFX11-NEXT: s_lshr_b32 s61, s4, 16 ; GFX11-NEXT: s_lshr_b32 s62, s3, 16 ; GFX11-NEXT: s_lshr_b32 s63, s2, 16 ; GFX11-NEXT: s_lshr_b32 s72, s1, 16 @@ -3911,50 +4022,50 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s74 ; GFX11-NEXT: s_cbranch_vccnz .LBB13_3 ; GFX11-NEXT: .LBB13_2: ; %cmp.true -; GFX11-NEXT: s_add_i32 s8, s8, 3 -; GFX11-NEXT: s_add_i32 s9, s9, 3 -; GFX11-NEXT: s_add_i32 s7, s7, 3 -; GFX11-NEXT: s_add_i32 s6, s6, 3 -; GFX11-NEXT: s_add_i32 s5, s5, 3 -; GFX11-NEXT: s_add_i32 s4, s4, 3 -; GFX11-NEXT: s_add_i32 s29, s29, 3 -; GFX11-NEXT: s_add_i32 s28, s28, 3 -; GFX11-NEXT: s_add_i32 s27, s27, 3 -; GFX11-NEXT: s_add_i32 s26, s26, 3 -; GFX11-NEXT: s_add_i32 s25, s25, 3 -; GFX11-NEXT: s_add_i32 s24, s24, 3 -; GFX11-NEXT: s_add_i32 s23, s23, 3 ; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s23, s23, 3 ; GFX11-NEXT: s_add_i32 s21, s21, 3 ; GFX11-NEXT: s_add_i32 s20, s20, 3 ; GFX11-NEXT: s_add_i32 s19, s19, 3 ; GFX11-NEXT: s_add_i32 s18, s18, 3 ; GFX11-NEXT: s_add_i32 s17, s17, 3 ; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s15, s15, 3 +; GFX11-NEXT: s_add_i32 s14, s14, 3 +; GFX11-NEXT: s_add_i32 s13, s13, 3 +; GFX11-NEXT: s_add_i32 s12, s12, 3 +; GFX11-NEXT: s_add_i32 s11, s11, 3 +; GFX11-NEXT: s_add_i32 s10, s10, 3 +; GFX11-NEXT: s_add_i32 s9, s9, 3 +; GFX11-NEXT: s_add_i32 s8, s8, 3 +; GFX11-NEXT: s_add_i32 s7, s7, 3 +; GFX11-NEXT: s_add_i32 s6, s6, 3 +; GFX11-NEXT: s_add_i32 s5, s5, 3 +; GFX11-NEXT: s_add_i32 s4, s4, 3 ; GFX11-NEXT: s_add_i32 s3, s3, 3 ; GFX11-NEXT: s_add_i32 s2, s2, 3 ; GFX11-NEXT: s_add_i32 s1, s1, 3 ; GFX11-NEXT: s_add_i32 s0, s0, 3 -; GFX11-NEXT: s_lshr_b32 s10, s8, 16 -; GFX11-NEXT: s_lshr_b32 s11, s9, 16 -; GFX11-NEXT: s_lshr_b32 s12, s7, 16 -; GFX11-NEXT: s_lshr_b32 s13, s6, 16 -; GFX11-NEXT: s_lshr_b32 s14, s5, 16 -; GFX11-NEXT: s_lshr_b32 s15, s4, 16 -; GFX11-NEXT: s_lshr_b32 s40, s29, 16 -; GFX11-NEXT: s_lshr_b32 s41, s28, 16 -; GFX11-NEXT: s_lshr_b32 s42, s27, 16 -; GFX11-NEXT: s_lshr_b32 s43, s26, 16 -; GFX11-NEXT: s_lshr_b32 s44, s25, 16 -; GFX11-NEXT: s_lshr_b32 s45, s24, 16 -; GFX11-NEXT: s_lshr_b32 s46, s23, 16 -; GFX11-NEXT: s_lshr_b32 s47, s22, 16 -; GFX11-NEXT: s_lshr_b32 s56, s21, 16 -; GFX11-NEXT: s_lshr_b32 s57, s20, 16 -; GFX11-NEXT: s_lshr_b32 s58, s19, 16 -; GFX11-NEXT: s_lshr_b32 s59, s18, 16 -; GFX11-NEXT: s_lshr_b32 s60, s17, 16 -; GFX11-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-NEXT: s_lshr_b32 s24, s22, 16 +; GFX11-NEXT: s_lshr_b32 s25, s23, 16 +; GFX11-NEXT: s_lshr_b32 s26, s21, 16 +; GFX11-NEXT: s_lshr_b32 s27, s20, 16 +; GFX11-NEXT: s_lshr_b32 s28, s19, 16 +; GFX11-NEXT: s_lshr_b32 s29, s18, 16 +; GFX11-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-NEXT: s_lshr_b32 s42, s15, 16 +; GFX11-NEXT: s_lshr_b32 s43, s14, 16 +; GFX11-NEXT: s_lshr_b32 s44, s13, 16 +; GFX11-NEXT: s_lshr_b32 s45, s12, 16 +; GFX11-NEXT: s_lshr_b32 s46, s11, 16 +; GFX11-NEXT: s_lshr_b32 s47, s10, 16 +; GFX11-NEXT: s_lshr_b32 s56, s9, 16 +; GFX11-NEXT: s_lshr_b32 s57, s8, 16 +; GFX11-NEXT: s_lshr_b32 s58, s7, 16 +; GFX11-NEXT: s_lshr_b32 s59, s6, 16 +; GFX11-NEXT: s_lshr_b32 s60, s5, 16 +; GFX11-NEXT: s_lshr_b32 s61, s4, 16 ; GFX11-NEXT: s_lshr_b32 s62, s3, 16 ; GFX11-NEXT: s_lshr_b32 s63, s2, 16 ; GFX11-NEXT: s_lshr_b32 s72, s1, 16 @@ -3965,38 +4076,38 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3 ; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s72 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s63 ; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s62 -; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s61 -; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s60 -; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s59 -; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s58 -; GFX11-NEXT: s_pack_ll_b32_b16 s20, s20, s57 -; GFX11-NEXT: s_pack_ll_b32_b16 s21, s21, s56 -; GFX11-NEXT: s_pack_ll_b32_b16 s22, s22, s47 -; GFX11-NEXT: s_pack_ll_b32_b16 s23, s23, s46 -; GFX11-NEXT: s_pack_ll_b32_b16 s24, s24, s45 -; GFX11-NEXT: s_pack_ll_b32_b16 s25, s25, s44 -; GFX11-NEXT: s_pack_ll_b32_b16 s26, s26, s43 -; GFX11-NEXT: s_pack_ll_b32_b16 s27, s27, s42 -; GFX11-NEXT: s_pack_ll_b32_b16 s28, s28, s41 -; GFX11-NEXT: s_pack_ll_b32_b16 s29, s29, s40 -; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s15 -; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s14 -; GFX11-NEXT: s_pack_ll_b32_b16 s6, s6, s13 -; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s12 -; GFX11-NEXT: s_pack_ll_b32_b16 s9, s9, s11 -; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s10 +; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s61 +; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s60 +; GFX11-NEXT: s_pack_ll_b32_b16 s6, s6, s59 +; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s58 +; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s57 +; GFX11-NEXT: s_pack_ll_b32_b16 s9, s9, s56 +; GFX11-NEXT: s_pack_ll_b32_b16 s10, s10, s47 +; GFX11-NEXT: s_pack_ll_b32_b16 s11, s11, s46 +; GFX11-NEXT: s_pack_ll_b32_b16 s12, s12, s45 +; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s44 +; GFX11-NEXT: s_pack_ll_b32_b16 s14, s14, s43 +; GFX11-NEXT: s_pack_ll_b32_b16 s15, s15, s42 +; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s41 +; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s40 +; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s29 +; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s28 +; GFX11-NEXT: s_pack_ll_b32_b16 s20, s20, s27 +; GFX11-NEXT: s_pack_ll_b32_b16 s21, s21, s26 +; GFX11-NEXT: s_pack_ll_b32_b16 s23, s23, s25 +; GFX11-NEXT: s_pack_ll_b32_b16 s22, s22, s24 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 -; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 -; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 -; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 -; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 -; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 -; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 -; GFX11-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 -; GFX11-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 -; GFX11-NEXT: v_dual_mov_b32 v22, s9 :: v_dual_mov_b32 v23, s8 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GFX11-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v21, s21 +; GFX11-NEXT: v_dual_mov_b32 v22, s23 :: v_dual_mov_b32 v23, s22 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB13_4: ; GFX11-NEXT: ; implicit-def: $sgpr73 @@ -4017,12 +4128,12 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr41 ; GFX11-NEXT: ; implicit-def: $sgpr40 -; GFX11-NEXT: ; implicit-def: $sgpr15 -; GFX11-NEXT: ; implicit-def: $sgpr14 -; GFX11-NEXT: ; implicit-def: $sgpr13 -; GFX11-NEXT: ; implicit-def: $sgpr12 -; GFX11-NEXT: ; implicit-def: $sgpr11 -; GFX11-NEXT: ; implicit-def: $sgpr10 +; GFX11-NEXT: ; implicit-def: $sgpr29 +; GFX11-NEXT: ; implicit-def: $sgpr28 +; GFX11-NEXT: ; implicit-def: $sgpr27 +; GFX11-NEXT: ; implicit-def: $sgpr26 +; GFX11-NEXT: ; implicit-def: $sgpr25 +; GFX11-NEXT: ; implicit-def: $sgpr24 ; GFX11-NEXT: s_branch .LBB13_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -4420,7 +4531,7 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 ; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 ; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v23 ; SI-NEXT: .LBB14_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -7204,7 +7315,35 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i ; SI-LABEL: bitcast_v24i32_to_v48f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, s16 +; SI-NEXT: v_mov_b32_e32 v13, s17 +; SI-NEXT: v_mov_b32_e32 v14, s18 +; SI-NEXT: v_mov_b32_e32 v15, s19 +; SI-NEXT: v_mov_b32_e32 v16, s20 +; SI-NEXT: v_mov_b32_e32 v17, s21 +; SI-NEXT: v_mov_b32_e32 v18, s22 +; SI-NEXT: v_mov_b32_e32 v19, s23 +; SI-NEXT: v_readfirstlane_b32 s23, v12 +; SI-NEXT: v_mov_b32_e32 v12, s24 +; SI-NEXT: v_readfirstlane_b32 s24, v13 +; SI-NEXT: v_mov_b32_e32 v13, s25 +; SI-NEXT: v_readfirstlane_b32 s25, v14 +; SI-NEXT: v_mov_b32_e32 v14, s26 +; SI-NEXT: v_readfirstlane_b32 s26, v15 +; SI-NEXT: v_mov_b32_e32 v15, s27 +; SI-NEXT: v_readfirstlane_b32 s27, v16 +; SI-NEXT: v_mov_b32_e32 v16, s28 +; SI-NEXT: v_readfirstlane_b32 s28, v17 +; SI-NEXT: v_mov_b32_e32 v17, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_readfirstlane_b32 s29, v18 +; SI-NEXT: v_readfirstlane_b32 s22, v19 +; SI-NEXT: v_readfirstlane_b32 s21, v12 +; SI-NEXT: v_readfirstlane_b32 s20, v13 +; SI-NEXT: v_readfirstlane_b32 s19, v14 +; SI-NEXT: v_readfirstlane_b32 s18, v15 +; SI-NEXT: v_readfirstlane_b32 s17, v16 +; SI-NEXT: v_readfirstlane_b32 s16, v17 ; SI-NEXT: v_readfirstlane_b32 s15, v1 ; SI-NEXT: v_readfirstlane_b32 s14, v2 ; SI-NEXT: v_readfirstlane_b32 s13, v3 @@ -7239,33 +7378,33 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 ; SI-NEXT: s_lshr_b32 s4, s15, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: s_lshr_b32 s4, s17, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: s_lshr_b32 s4, s18, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: s_lshr_b32 s4, s19, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: s_lshr_b32 s4, s20, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: s_lshr_b32 s4, s21, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 ; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: s_lshr_b32 s4, s28, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: s_lshr_b32 s4, s27, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: s_lshr_b32 s4, s26, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: s_lshr_b32 s4, s25, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: s_lshr_b32 s4, s24, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_lshr_b32 s4, s23, 16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 @@ -7278,29 +7417,22 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s23 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 @@ -7308,6 +7440,13 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i ; SI-NEXT: s_add_i32 s27, s27, 3 ; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s15, s15, 3 ; SI-NEXT: s_add_i32 s14, s14, 3 ; SI-NEXT: s_add_i32 s13, s13, 3 @@ -7318,20 +7457,20 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i ; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: s_lshr_b32 s5, s17, 16 -; SI-NEXT: s_lshr_b32 s40, s18, 16 -; SI-NEXT: s_lshr_b32 s41, s19, 16 -; SI-NEXT: s_lshr_b32 s42, s20, 16 -; SI-NEXT: s_lshr_b32 s43, s21, 16 -; SI-NEXT: s_lshr_b32 s44, s22, 16 -; SI-NEXT: s_lshr_b32 s45, s23, 16 -; SI-NEXT: s_lshr_b32 s46, s24, 16 -; SI-NEXT: s_lshr_b32 s47, s25, 16 -; SI-NEXT: s_lshr_b32 s56, s26, 16 -; SI-NEXT: s_lshr_b32 s57, s27, 16 -; SI-NEXT: s_lshr_b32 s58, s28, 16 -; SI-NEXT: s_lshr_b32 s59, s29, 16 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: s_lshr_b32 s5, s24, 16 +; SI-NEXT: s_lshr_b32 s40, s25, 16 +; SI-NEXT: s_lshr_b32 s41, s26, 16 +; SI-NEXT: s_lshr_b32 s42, s27, 16 +; SI-NEXT: s_lshr_b32 s43, s28, 16 +; SI-NEXT: s_lshr_b32 s44, s29, 16 +; SI-NEXT: s_lshr_b32 s45, s22, 16 +; SI-NEXT: s_lshr_b32 s46, s21, 16 +; SI-NEXT: s_lshr_b32 s47, s20, 16 +; SI-NEXT: s_lshr_b32 s56, s19, 16 +; SI-NEXT: s_lshr_b32 s57, s18, 16 +; SI-NEXT: s_lshr_b32 s58, s17, 16 +; SI-NEXT: s_lshr_b32 s59, s16, 16 ; SI-NEXT: s_lshr_b32 s60, s15, 16 ; SI-NEXT: s_lshr_b32 s61, s14, 16 ; SI-NEXT: s_lshr_b32 s62, s13, 16 @@ -7352,20 +7491,20 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s23 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s77 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s76 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s75 @@ -7616,7 +7755,35 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i ; VI-LABEL: bitcast_v24i32_to_v48f16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v11, s16 +; VI-NEXT: v_mov_b32_e32 v12, s17 +; VI-NEXT: v_mov_b32_e32 v13, s18 +; VI-NEXT: v_mov_b32_e32 v14, s19 +; VI-NEXT: v_mov_b32_e32 v15, s20 +; VI-NEXT: v_mov_b32_e32 v16, s21 +; VI-NEXT: v_mov_b32_e32 v17, s22 +; VI-NEXT: v_mov_b32_e32 v18, s23 +; VI-NEXT: v_mov_b32_e32 v19, s24 +; VI-NEXT: v_readfirstlane_b32 s42, v11 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_readfirstlane_b32 s41, v12 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_readfirstlane_b32 s40, v13 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_readfirstlane_b32 s26, v14 +; VI-NEXT: v_mov_b32_e32 v14, s28 +; VI-NEXT: v_readfirstlane_b32 s25, v15 +; VI-NEXT: v_mov_b32_e32 v15, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_readfirstlane_b32 s24, v16 +; VI-NEXT: v_readfirstlane_b32 s23, v17 +; VI-NEXT: v_readfirstlane_b32 s22, v18 +; VI-NEXT: v_readfirstlane_b32 s21, v19 +; VI-NEXT: v_readfirstlane_b32 s20, v11 +; VI-NEXT: v_readfirstlane_b32 s19, v12 +; VI-NEXT: v_readfirstlane_b32 s18, v13 +; VI-NEXT: v_readfirstlane_b32 s17, v14 +; VI-NEXT: v_readfirstlane_b32 s16, v15 ; VI-NEXT: v_readfirstlane_b32 s15, v0 ; VI-NEXT: v_readfirstlane_b32 s14, v1 ; VI-NEXT: v_readfirstlane_b32 s13, v2 @@ -7630,9 +7797,9 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i ; VI-NEXT: v_readfirstlane_b32 s7, v9 ; VI-NEXT: s_cbranch_scc0 .LBB17_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s40, s7, 16 -; VI-NEXT: s_lshr_b32 s41, s6, 16 -; VI-NEXT: s_lshr_b32 s42, s8, 16 +; VI-NEXT: s_lshr_b32 s27, s7, 16 +; VI-NEXT: s_lshr_b32 s28, s6, 16 +; VI-NEXT: s_lshr_b32 s29, s8, 16 ; VI-NEXT: s_lshr_b32 s43, s9, 16 ; VI-NEXT: s_lshr_b32 s44, s10, 16 ; VI-NEXT: s_lshr_b32 s45, s11, 16 @@ -7640,20 +7807,20 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i ; VI-NEXT: s_lshr_b32 s47, s13, 16 ; VI-NEXT: s_lshr_b32 s56, s14, 16 ; VI-NEXT: s_lshr_b32 s57, s15, 16 -; VI-NEXT: s_lshr_b32 s58, s29, 16 -; VI-NEXT: s_lshr_b32 s59, s28, 16 -; VI-NEXT: s_lshr_b32 s60, s27, 16 -; VI-NEXT: s_lshr_b32 s61, s26, 16 -; VI-NEXT: s_lshr_b32 s62, s25, 16 -; VI-NEXT: s_lshr_b32 s63, s24, 16 -; VI-NEXT: s_lshr_b32 s72, s23, 16 -; VI-NEXT: s_lshr_b32 s73, s22, 16 -; VI-NEXT: s_lshr_b32 s74, s21, 16 -; VI-NEXT: s_lshr_b32 s75, s20, 16 -; VI-NEXT: s_lshr_b32 s76, s19, 16 -; VI-NEXT: s_lshr_b32 s77, s18, 16 -; VI-NEXT: s_lshr_b32 s78, s17, 16 -; VI-NEXT: s_lshr_b32 s79, s16, 16 +; VI-NEXT: s_lshr_b32 s58, s16, 16 +; VI-NEXT: s_lshr_b32 s59, s17, 16 +; VI-NEXT: s_lshr_b32 s60, s18, 16 +; VI-NEXT: s_lshr_b32 s61, s19, 16 +; VI-NEXT: s_lshr_b32 s62, s20, 16 +; VI-NEXT: s_lshr_b32 s63, s21, 16 +; VI-NEXT: s_lshr_b32 s72, s22, 16 +; VI-NEXT: s_lshr_b32 s73, s23, 16 +; VI-NEXT: s_lshr_b32 s74, s24, 16 +; VI-NEXT: s_lshr_b32 s75, s25, 16 +; VI-NEXT: s_lshr_b32 s76, s26, 16 +; VI-NEXT: s_lshr_b32 s77, s40, 16 +; VI-NEXT: s_lshr_b32 s78, s41, 16 +; VI-NEXT: s_lshr_b32 s79, s42, 16 ; VI-NEXT: s_cbranch_execnz .LBB17_3 ; VI-NEXT: .LBB17_2: ; %cmp.true ; VI-NEXT: s_add_i32 s7, s7, 3 @@ -7666,23 +7833,23 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i ; VI-NEXT: s_add_i32 s13, s13, 3 ; VI-NEXT: s_add_i32 s14, s14, 3 ; VI-NEXT: s_add_i32 s15, s15, 3 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_add_i32 s17, s17, 3 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_lshr_b32 s40, s7, 16 -; VI-NEXT: s_lshr_b32 s41, s6, 16 -; VI-NEXT: s_lshr_b32 s42, s8, 16 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_lshr_b32 s27, s7, 16 +; VI-NEXT: s_lshr_b32 s28, s6, 16 +; VI-NEXT: s_lshr_b32 s29, s8, 16 ; VI-NEXT: s_lshr_b32 s43, s9, 16 ; VI-NEXT: s_lshr_b32 s44, s10, 16 ; VI-NEXT: s_lshr_b32 s45, s11, 16 @@ -7690,107 +7857,107 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i ; VI-NEXT: s_lshr_b32 s47, s13, 16 ; VI-NEXT: s_lshr_b32 s56, s14, 16 ; VI-NEXT: s_lshr_b32 s57, s15, 16 -; VI-NEXT: s_lshr_b32 s58, s29, 16 -; VI-NEXT: s_lshr_b32 s59, s28, 16 -; VI-NEXT: s_lshr_b32 s60, s27, 16 -; VI-NEXT: s_lshr_b32 s61, s26, 16 -; VI-NEXT: s_lshr_b32 s62, s25, 16 -; VI-NEXT: s_lshr_b32 s63, s24, 16 -; VI-NEXT: s_lshr_b32 s72, s23, 16 -; VI-NEXT: s_lshr_b32 s73, s22, 16 -; VI-NEXT: s_lshr_b32 s74, s21, 16 -; VI-NEXT: s_lshr_b32 s75, s20, 16 -; VI-NEXT: s_lshr_b32 s76, s19, 16 -; VI-NEXT: s_lshr_b32 s77, s18, 16 -; VI-NEXT: s_lshr_b32 s78, s17, 16 -; VI-NEXT: s_lshr_b32 s79, s16, 16 +; VI-NEXT: s_lshr_b32 s58, s16, 16 +; VI-NEXT: s_lshr_b32 s59, s17, 16 +; VI-NEXT: s_lshr_b32 s60, s18, 16 +; VI-NEXT: s_lshr_b32 s61, s19, 16 +; VI-NEXT: s_lshr_b32 s62, s20, 16 +; VI-NEXT: s_lshr_b32 s63, s21, 16 +; VI-NEXT: s_lshr_b32 s72, s22, 16 +; VI-NEXT: s_lshr_b32 s73, s23, 16 +; VI-NEXT: s_lshr_b32 s74, s24, 16 +; VI-NEXT: s_lshr_b32 s75, s25, 16 +; VI-NEXT: s_lshr_b32 s76, s26, 16 +; VI-NEXT: s_lshr_b32 s77, s40, 16 +; VI-NEXT: s_lshr_b32 s78, s41, 16 +; VI-NEXT: s_lshr_b32 s79, s42, 16 ; VI-NEXT: .LBB17_3: ; %end -; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_and_b32 s4, 0xffff, s42 ; VI-NEXT: s_lshl_b32 s5, s79, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s16, s78, 16 -; VI-NEXT: s_or_b32 s5, s5, s16 -; VI-NEXT: s_and_b32 s16, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s17, s77, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_and_b32 s17, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s18, s76, 16 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s18, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s19, s75, 16 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_and_b32 s19, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s20, s74, 16 -; VI-NEXT: s_or_b32 s19, s19, s20 -; VI-NEXT: s_and_b32 s20, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s21, s73, 16 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_and_b32 s21, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s22, s72, 16 -; VI-NEXT: s_or_b32 s21, s21, s22 -; VI-NEXT: s_and_b32 s22, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s23, s63, 16 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_and_b32 s23, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s24, s62, 16 -; VI-NEXT: s_or_b32 s23, s23, s24 -; VI-NEXT: s_and_b32 s24, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s25, s61, 16 -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_and_b32 s25, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s26, s60, 16 -; VI-NEXT: s_or_b32 s25, s25, s26 -; VI-NEXT: s_and_b32 s26, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s27, s59, 16 -; VI-NEXT: s_or_b32 s26, s26, s27 -; VI-NEXT: s_and_b32 s27, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s28, s58, 16 -; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s5, 0xffff, s41 +; VI-NEXT: s_lshl_b32 s41, s78, 16 +; VI-NEXT: s_or_b32 s5, s5, s41 +; VI-NEXT: s_and_b32 s40, 0xffff, s40 +; VI-NEXT: s_lshl_b32 s41, s77, 16 +; VI-NEXT: s_or_b32 s40, s40, s41 +; VI-NEXT: s_and_b32 s26, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s41, s76, 16 +; VI-NEXT: s_or_b32 s26, s26, s41 +; VI-NEXT: s_and_b32 s25, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s41, s75, 16 +; VI-NEXT: s_or_b32 s25, s25, s41 +; VI-NEXT: s_and_b32 s24, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s41, s74, 16 +; VI-NEXT: s_or_b32 s24, s24, s41 +; VI-NEXT: s_and_b32 s23, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s41, s73, 16 +; VI-NEXT: s_or_b32 s23, s23, s41 +; VI-NEXT: s_and_b32 s22, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s41, s72, 16 +; VI-NEXT: s_or_b32 s22, s22, s41 +; VI-NEXT: s_and_b32 s21, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s41, s63, 16 +; VI-NEXT: s_or_b32 s21, s21, s41 +; VI-NEXT: s_and_b32 s20, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s41, s62, 16 +; VI-NEXT: s_or_b32 s20, s20, s41 +; VI-NEXT: s_and_b32 s19, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s41, s61, 16 +; VI-NEXT: s_or_b32 s19, s19, s41 +; VI-NEXT: s_and_b32 s18, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s41, s60, 16 +; VI-NEXT: s_or_b32 s18, s18, s41 +; VI-NEXT: s_and_b32 s17, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s41, s59, 16 +; VI-NEXT: s_or_b32 s17, s17, s41 +; VI-NEXT: s_and_b32 s16, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s41, s58, 16 +; VI-NEXT: s_or_b32 s16, s16, s41 ; VI-NEXT: s_and_b32 s15, 0xffff, s15 -; VI-NEXT: s_lshl_b32 s28, s57, 16 -; VI-NEXT: s_or_b32 s15, s15, s28 +; VI-NEXT: s_lshl_b32 s41, s57, 16 +; VI-NEXT: s_or_b32 s15, s15, s41 ; VI-NEXT: s_and_b32 s14, 0xffff, s14 -; VI-NEXT: s_lshl_b32 s28, s56, 16 -; VI-NEXT: s_or_b32 s14, s14, s28 +; VI-NEXT: s_lshl_b32 s41, s56, 16 +; VI-NEXT: s_or_b32 s14, s14, s41 ; VI-NEXT: s_and_b32 s13, 0xffff, s13 -; VI-NEXT: s_lshl_b32 s28, s47, 16 -; VI-NEXT: s_or_b32 s13, s13, s28 +; VI-NEXT: s_lshl_b32 s41, s47, 16 +; VI-NEXT: s_or_b32 s13, s13, s41 ; VI-NEXT: s_and_b32 s12, 0xffff, s12 -; VI-NEXT: s_lshl_b32 s28, s46, 16 -; VI-NEXT: s_or_b32 s12, s12, s28 +; VI-NEXT: s_lshl_b32 s41, s46, 16 +; VI-NEXT: s_or_b32 s12, s12, s41 ; VI-NEXT: s_and_b32 s11, 0xffff, s11 -; VI-NEXT: s_lshl_b32 s28, s45, 16 -; VI-NEXT: s_or_b32 s11, s11, s28 +; VI-NEXT: s_lshl_b32 s41, s45, 16 +; VI-NEXT: s_or_b32 s11, s11, s41 ; VI-NEXT: s_and_b32 s10, 0xffff, s10 -; VI-NEXT: s_lshl_b32 s28, s44, 16 -; VI-NEXT: s_or_b32 s10, s10, s28 +; VI-NEXT: s_lshl_b32 s41, s44, 16 +; VI-NEXT: s_or_b32 s10, s10, s41 ; VI-NEXT: s_and_b32 s9, 0xffff, s9 -; VI-NEXT: s_lshl_b32 s28, s43, 16 -; VI-NEXT: s_or_b32 s9, s9, s28 +; VI-NEXT: s_lshl_b32 s41, s43, 16 ; VI-NEXT: s_and_b32 s8, 0xffff, s8 -; VI-NEXT: s_lshl_b32 s28, s42, 16 -; VI-NEXT: s_or_b32 s8, s8, s28 +; VI-NEXT: s_lshl_b32 s29, s29, 16 ; VI-NEXT: s_and_b32 s6, 0xffff, s6 -; VI-NEXT: s_lshl_b32 s28, s41, 16 -; VI-NEXT: s_or_b32 s6, s6, s28 +; VI-NEXT: s_lshl_b32 s28, s28, 16 ; VI-NEXT: s_and_b32 s7, 0xffff, s7 -; VI-NEXT: s_lshl_b32 s28, s40, 16 -; VI-NEXT: s_or_b32 s7, s7, s28 +; VI-NEXT: s_lshl_b32 s27, s27, 16 +; VI-NEXT: s_or_b32 s9, s9, s41 +; VI-NEXT: s_or_b32 s8, s8, s29 +; VI-NEXT: s_or_b32 s6, s6, s28 +; VI-NEXT: s_or_b32 s7, s7, s27 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: v_mov_b32_e32 v3, s17 -; VI-NEXT: v_mov_b32_e32 v4, s18 -; VI-NEXT: v_mov_b32_e32 v5, s19 -; VI-NEXT: v_mov_b32_e32 v6, s20 -; VI-NEXT: v_mov_b32_e32 v7, s21 -; VI-NEXT: v_mov_b32_e32 v8, s22 -; VI-NEXT: v_mov_b32_e32 v9, s23 -; VI-NEXT: v_mov_b32_e32 v10, s24 -; VI-NEXT: v_mov_b32_e32 v11, s25 -; VI-NEXT: v_mov_b32_e32 v12, s26 -; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v2, s40 +; VI-NEXT: v_mov_b32_e32 v3, s26 +; VI-NEXT: v_mov_b32_e32 v4, s25 +; VI-NEXT: v_mov_b32_e32 v5, s24 +; VI-NEXT: v_mov_b32_e32 v6, s23 +; VI-NEXT: v_mov_b32_e32 v7, s22 +; VI-NEXT: v_mov_b32_e32 v8, s21 +; VI-NEXT: v_mov_b32_e32 v9, s20 +; VI-NEXT: v_mov_b32_e32 v10, s19 +; VI-NEXT: v_mov_b32_e32 v11, s18 +; VI-NEXT: v_mov_b32_e32 v12, s17 +; VI-NEXT: v_mov_b32_e32 v13, s16 ; VI-NEXT: v_mov_b32_e32 v14, s15 ; VI-NEXT: v_mov_b32_e32 v15, s14 ; VI-NEXT: v_mov_b32_e32 v16, s13 @@ -7824,64 +7991,82 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i ; VI-NEXT: ; implicit-def: $sgpr45 ; VI-NEXT: ; implicit-def: $sgpr44 ; VI-NEXT: ; implicit-def: $sgpr43 -; VI-NEXT: ; implicit-def: $sgpr42 -; VI-NEXT: ; implicit-def: $sgpr41 -; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 ; VI-NEXT: s_branch .LBB17_2 ; ; GFX9-LABEL: bitcast_v24i32_to_v48f16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v11, s16 +; GFX9-NEXT: v_mov_b32_e32 v12, s17 +; GFX9-NEXT: v_mov_b32_e32 v13, s18 +; GFX9-NEXT: v_mov_b32_e32 v14, s19 +; GFX9-NEXT: v_mov_b32_e32 v15, s20 +; GFX9-NEXT: v_mov_b32_e32 v16, s21 +; GFX9-NEXT: v_mov_b32_e32 v17, s22 +; GFX9-NEXT: v_mov_b32_e32 v18, s23 +; GFX9-NEXT: v_mov_b32_e32 v19, s24 +; GFX9-NEXT: v_readfirstlane_b32 s6, v11 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_readfirstlane_b32 s7, v12 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_readfirstlane_b32 s8, v13 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_readfirstlane_b32 s9, v14 +; GFX9-NEXT: v_mov_b32_e32 v14, s28 +; GFX9-NEXT: v_readfirstlane_b32 s10, v15 +; GFX9-NEXT: v_mov_b32_e32 v15, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: v_readfirstlane_b32 s7, v1 -; GFX9-NEXT: v_readfirstlane_b32 s8, v2 -; GFX9-NEXT: v_readfirstlane_b32 s9, v3 -; GFX9-NEXT: v_readfirstlane_b32 s10, v4 -; GFX9-NEXT: v_readfirstlane_b32 s11, v5 -; GFX9-NEXT: v_readfirstlane_b32 s12, v6 -; GFX9-NEXT: v_readfirstlane_b32 s13, v7 -; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: v_readfirstlane_b32 s11, v16 +; GFX9-NEXT: v_readfirstlane_b32 s12, v17 +; GFX9-NEXT: v_readfirstlane_b32 s13, v18 +; GFX9-NEXT: v_readfirstlane_b32 s14, v19 +; GFX9-NEXT: v_readfirstlane_b32 s15, v11 +; GFX9-NEXT: v_readfirstlane_b32 s16, v12 +; GFX9-NEXT: v_readfirstlane_b32 s17, v13 +; GFX9-NEXT: v_readfirstlane_b32 s18, v14 +; GFX9-NEXT: v_readfirstlane_b32 s19, v15 +; GFX9-NEXT: v_readfirstlane_b32 s20, v0 +; GFX9-NEXT: v_readfirstlane_b32 s21, v1 +; GFX9-NEXT: v_readfirstlane_b32 s22, v2 +; GFX9-NEXT: v_readfirstlane_b32 s23, v3 +; GFX9-NEXT: v_readfirstlane_b32 s24, v4 +; GFX9-NEXT: v_readfirstlane_b32 s25, v5 +; GFX9-NEXT: v_readfirstlane_b32 s26, v6 +; GFX9-NEXT: v_readfirstlane_b32 s27, v7 +; GFX9-NEXT: v_readfirstlane_b32 s28, v8 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s29, v9 ; GFX9-NEXT: s_cbranch_scc0 .LBB17_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_lshr_b32 s40, s15, 16 -; GFX9-NEXT: s_lshr_b32 s41, s14, 16 -; GFX9-NEXT: s_lshr_b32 s42, s13, 16 -; GFX9-NEXT: s_lshr_b32 s43, s12, 16 -; GFX9-NEXT: s_lshr_b32 s44, s11, 16 -; GFX9-NEXT: s_lshr_b32 s45, s10, 16 -; GFX9-NEXT: s_lshr_b32 s46, s9, 16 -; GFX9-NEXT: s_lshr_b32 s47, s8, 16 -; GFX9-NEXT: s_lshr_b32 s56, s7, 16 -; GFX9-NEXT: s_lshr_b32 s57, s6, 16 -; GFX9-NEXT: s_lshr_b32 s58, s29, 16 -; GFX9-NEXT: s_lshr_b32 s59, s28, 16 -; GFX9-NEXT: s_lshr_b32 s60, s27, 16 -; GFX9-NEXT: s_lshr_b32 s61, s26, 16 -; GFX9-NEXT: s_lshr_b32 s62, s25, 16 -; GFX9-NEXT: s_lshr_b32 s63, s24, 16 -; GFX9-NEXT: s_lshr_b32 s72, s23, 16 -; GFX9-NEXT: s_lshr_b32 s73, s22, 16 -; GFX9-NEXT: s_lshr_b32 s74, s21, 16 -; GFX9-NEXT: s_lshr_b32 s75, s20, 16 -; GFX9-NEXT: s_lshr_b32 s76, s19, 16 -; GFX9-NEXT: s_lshr_b32 s77, s18, 16 -; GFX9-NEXT: s_lshr_b32 s78, s17, 16 -; GFX9-NEXT: s_lshr_b32 s79, s16, 16 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s44, s25, 16 +; GFX9-NEXT: s_lshr_b32 s45, s24, 16 +; GFX9-NEXT: s_lshr_b32 s46, s23, 16 +; GFX9-NEXT: s_lshr_b32 s47, s22, 16 +; GFX9-NEXT: s_lshr_b32 s56, s21, 16 +; GFX9-NEXT: s_lshr_b32 s57, s20, 16 +; GFX9-NEXT: s_lshr_b32 s58, s19, 16 +; GFX9-NEXT: s_lshr_b32 s59, s18, 16 +; GFX9-NEXT: s_lshr_b32 s60, s17, 16 +; GFX9-NEXT: s_lshr_b32 s61, s16, 16 +; GFX9-NEXT: s_lshr_b32 s62, s15, 16 +; GFX9-NEXT: s_lshr_b32 s63, s14, 16 +; GFX9-NEXT: s_lshr_b32 s72, s13, 16 +; GFX9-NEXT: s_lshr_b32 s73, s12, 16 +; GFX9-NEXT: s_lshr_b32 s74, s11, 16 +; GFX9-NEXT: s_lshr_b32 s75, s10, 16 +; GFX9-NEXT: s_lshr_b32 s76, s9, 16 +; GFX9-NEXT: s_lshr_b32 s77, s8, 16 +; GFX9-NEXT: s_lshr_b32 s78, s7, 16 +; GFX9-NEXT: s_lshr_b32 s79, s6, 16 ; GFX9-NEXT: s_cbranch_execnz .LBB17_3 ; GFX9-NEXT: .LBB17_2: ; %cmp.true -; GFX9-NEXT: s_add_i32 s15, s15, 3 -; GFX9-NEXT: s_add_i32 s14, s14, 3 -; GFX9-NEXT: s_add_i32 s13, s13, 3 -; GFX9-NEXT: s_add_i32 s12, s12, 3 -; GFX9-NEXT: s_add_i32 s11, s11, 3 -; GFX9-NEXT: s_add_i32 s10, s10, 3 -; GFX9-NEXT: s_add_i32 s9, s9, 3 -; GFX9-NEXT: s_add_i32 s8, s8, 3 -; GFX9-NEXT: s_add_i32 s7, s7, 3 -; GFX9-NEXT: s_add_i32 s6, s6, 3 ; GFX9-NEXT: s_add_i32 s29, s29, 3 ; GFX9-NEXT: s_add_i32 s28, s28, 3 ; GFX9-NEXT: s_add_i32 s27, s27, 3 @@ -7896,79 +8081,89 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i ; GFX9-NEXT: s_add_i32 s18, s18, 3 ; GFX9-NEXT: s_add_i32 s17, s17, 3 ; GFX9-NEXT: s_add_i32 s16, s16, 3 -; GFX9-NEXT: s_lshr_b32 s40, s15, 16 -; GFX9-NEXT: s_lshr_b32 s41, s14, 16 -; GFX9-NEXT: s_lshr_b32 s42, s13, 16 -; GFX9-NEXT: s_lshr_b32 s43, s12, 16 -; GFX9-NEXT: s_lshr_b32 s44, s11, 16 -; GFX9-NEXT: s_lshr_b32 s45, s10, 16 -; GFX9-NEXT: s_lshr_b32 s46, s9, 16 -; GFX9-NEXT: s_lshr_b32 s47, s8, 16 -; GFX9-NEXT: s_lshr_b32 s56, s7, 16 -; GFX9-NEXT: s_lshr_b32 s57, s6, 16 -; GFX9-NEXT: s_lshr_b32 s58, s29, 16 -; GFX9-NEXT: s_lshr_b32 s59, s28, 16 -; GFX9-NEXT: s_lshr_b32 s60, s27, 16 -; GFX9-NEXT: s_lshr_b32 s61, s26, 16 -; GFX9-NEXT: s_lshr_b32 s62, s25, 16 -; GFX9-NEXT: s_lshr_b32 s63, s24, 16 -; GFX9-NEXT: s_lshr_b32 s72, s23, 16 -; GFX9-NEXT: s_lshr_b32 s73, s22, 16 -; GFX9-NEXT: s_lshr_b32 s74, s21, 16 -; GFX9-NEXT: s_lshr_b32 s75, s20, 16 -; GFX9-NEXT: s_lshr_b32 s76, s19, 16 -; GFX9-NEXT: s_lshr_b32 s77, s18, 16 -; GFX9-NEXT: s_lshr_b32 s78, s17, 16 -; GFX9-NEXT: s_lshr_b32 s79, s16, 16 +; GFX9-NEXT: s_add_i32 s15, s15, 3 +; GFX9-NEXT: s_add_i32 s14, s14, 3 +; GFX9-NEXT: s_add_i32 s13, s13, 3 +; GFX9-NEXT: s_add_i32 s12, s12, 3 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_add_i32 s10, s10, 3 +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: s_add_i32 s6, s6, 3 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s44, s25, 16 +; GFX9-NEXT: s_lshr_b32 s45, s24, 16 +; GFX9-NEXT: s_lshr_b32 s46, s23, 16 +; GFX9-NEXT: s_lshr_b32 s47, s22, 16 +; GFX9-NEXT: s_lshr_b32 s56, s21, 16 +; GFX9-NEXT: s_lshr_b32 s57, s20, 16 +; GFX9-NEXT: s_lshr_b32 s58, s19, 16 +; GFX9-NEXT: s_lshr_b32 s59, s18, 16 +; GFX9-NEXT: s_lshr_b32 s60, s17, 16 +; GFX9-NEXT: s_lshr_b32 s61, s16, 16 +; GFX9-NEXT: s_lshr_b32 s62, s15, 16 +; GFX9-NEXT: s_lshr_b32 s63, s14, 16 +; GFX9-NEXT: s_lshr_b32 s72, s13, 16 +; GFX9-NEXT: s_lshr_b32 s73, s12, 16 +; GFX9-NEXT: s_lshr_b32 s74, s11, 16 +; GFX9-NEXT: s_lshr_b32 s75, s10, 16 +; GFX9-NEXT: s_lshr_b32 s76, s9, 16 +; GFX9-NEXT: s_lshr_b32 s77, s8, 16 +; GFX9-NEXT: s_lshr_b32 s78, s7, 16 +; GFX9-NEXT: s_lshr_b32 s79, s6, 16 ; GFX9-NEXT: .LBB17_3: ; %end -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s79 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s78 -; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s77 -; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s76 -; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s75 -; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s74 -; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s73 -; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s72 -; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s63 -; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s62 -; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s61 -; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s60 -; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s59 -; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s58 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s57 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s56 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s47 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s46 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s45 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s44 -; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s43 -; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s42 -; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s41 -; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s40 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s7, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s9, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s10, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s11, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s12, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s13, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s14, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s15, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s16, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s17, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s40 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s18 -; GFX9-NEXT: v_mov_b32_e32 v5, s19 -; GFX9-NEXT: v_mov_b32_e32 v6, s20 -; GFX9-NEXT: v_mov_b32_e32 v7, s21 -; GFX9-NEXT: v_mov_b32_e32 v8, s22 -; GFX9-NEXT: v_mov_b32_e32 v9, s23 -; GFX9-NEXT: v_mov_b32_e32 v10, s24 -; GFX9-NEXT: v_mov_b32_e32 v11, s25 -; GFX9-NEXT: v_mov_b32_e32 v12, s26 -; GFX9-NEXT: v_mov_b32_e32 v13, s27 -; GFX9-NEXT: v_mov_b32_e32 v14, s6 -; GFX9-NEXT: v_mov_b32_e32 v15, s7 -; GFX9-NEXT: v_mov_b32_e32 v16, s8 -; GFX9-NEXT: v_mov_b32_e32 v17, s9 -; GFX9-NEXT: v_mov_b32_e32 v18, s10 -; GFX9-NEXT: v_mov_b32_e32 v19, s11 -; GFX9-NEXT: v_mov_b32_e32 v20, s12 -; GFX9-NEXT: v_mov_b32_e32 v21, s13 -; GFX9-NEXT: v_mov_b32_e32 v22, s14 -; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-NEXT: v_mov_b32_e32 v12, s16 +; GFX9-NEXT: v_mov_b32_e32 v13, s17 +; GFX9-NEXT: v_mov_b32_e32 v14, s18 +; GFX9-NEXT: v_mov_b32_e32 v15, s19 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v18, s22 +; GFX9-NEXT: v_mov_b32_e32 v19, s23 +; GFX9-NEXT: v_mov_b32_e32 v20, s24 +; GFX9-NEXT: v_mov_b32_e32 v21, s25 +; GFX9-NEXT: v_mov_b32_e32 v22, s26 +; GFX9-NEXT: v_mov_b32_e32 v23, s27 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB17_4: ; GFX9-NEXT: ; implicit-def: $sgpr79 @@ -8000,37 +8195,64 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i ; GFX11-LABEL: bitcast_v24i32_to_v48f16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v7, s0 :: v_dual_mov_b32 v8, s1 +; GFX11-NEXT: v_dual_mov_b32 v9, s2 :: v_dual_mov_b32 v10, s3 +; GFX11-NEXT: v_dual_mov_b32 v11, s16 :: v_dual_mov_b32 v12, s17 +; GFX11-NEXT: v_dual_mov_b32 v13, s18 :: v_dual_mov_b32 v14, s19 +; GFX11-NEXT: v_dual_mov_b32 v15, s20 :: v_dual_mov_b32 v16, s21 +; GFX11-NEXT: v_dual_mov_b32 v17, s22 :: v_dual_mov_b32 v18, s23 +; GFX11-NEXT: v_dual_mov_b32 v19, s24 :: v_dual_mov_b32 v20, s25 +; GFX11-NEXT: v_dual_mov_b32 v21, s26 :: v_dual_mov_b32 v22, s27 +; GFX11-NEXT: v_dual_mov_b32 v23, s28 :: v_dual_mov_b32 v24, s29 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: v_readfirstlane_b32 s9, v4 -; GFX11-NEXT: v_readfirstlane_b32 s8, v5 +; GFX11-NEXT: v_readfirstlane_b32 s0, v7 +; GFX11-NEXT: v_readfirstlane_b32 s1, v8 +; GFX11-NEXT: v_readfirstlane_b32 s2, v9 +; GFX11-NEXT: v_readfirstlane_b32 s3, v10 +; GFX11-NEXT: v_readfirstlane_b32 s4, v11 +; GFX11-NEXT: v_readfirstlane_b32 s5, v12 +; GFX11-NEXT: v_readfirstlane_b32 s6, v13 +; GFX11-NEXT: v_readfirstlane_b32 s7, v14 +; GFX11-NEXT: v_readfirstlane_b32 s8, v15 +; GFX11-NEXT: v_readfirstlane_b32 s9, v16 +; GFX11-NEXT: v_readfirstlane_b32 s10, v17 +; GFX11-NEXT: v_readfirstlane_b32 s11, v18 +; GFX11-NEXT: v_readfirstlane_b32 s12, v19 +; GFX11-NEXT: v_readfirstlane_b32 s13, v20 +; GFX11-NEXT: v_readfirstlane_b32 s14, v21 +; GFX11-NEXT: v_readfirstlane_b32 s15, v22 +; GFX11-NEXT: v_readfirstlane_b32 s16, v23 +; GFX11-NEXT: v_readfirstlane_b32 s17, v24 +; GFX11-NEXT: v_readfirstlane_b32 s18, v0 +; GFX11-NEXT: v_readfirstlane_b32 s19, v1 +; GFX11-NEXT: v_readfirstlane_b32 s20, v2 +; GFX11-NEXT: v_readfirstlane_b32 s21, v3 +; GFX11-NEXT: v_readfirstlane_b32 s23, v4 +; GFX11-NEXT: v_readfirstlane_b32 s22, v5 ; GFX11-NEXT: s_mov_b32 s74, 0 -; GFX11-NEXT: s_and_b32 s10, vcc_lo, exec_lo +; GFX11-NEXT: s_and_b32 s24, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB17_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: s_lshr_b32 s10, s8, 16 -; GFX11-NEXT: s_lshr_b32 s11, s9, 16 -; GFX11-NEXT: s_lshr_b32 s12, s7, 16 -; GFX11-NEXT: s_lshr_b32 s13, s6, 16 -; GFX11-NEXT: s_lshr_b32 s14, s5, 16 -; GFX11-NEXT: s_lshr_b32 s15, s4, 16 -; GFX11-NEXT: s_lshr_b32 s40, s29, 16 -; GFX11-NEXT: s_lshr_b32 s41, s28, 16 -; GFX11-NEXT: s_lshr_b32 s42, s27, 16 -; GFX11-NEXT: s_lshr_b32 s43, s26, 16 -; GFX11-NEXT: s_lshr_b32 s44, s25, 16 -; GFX11-NEXT: s_lshr_b32 s45, s24, 16 -; GFX11-NEXT: s_lshr_b32 s46, s23, 16 -; GFX11-NEXT: s_lshr_b32 s47, s22, 16 -; GFX11-NEXT: s_lshr_b32 s56, s21, 16 -; GFX11-NEXT: s_lshr_b32 s57, s20, 16 -; GFX11-NEXT: s_lshr_b32 s58, s19, 16 -; GFX11-NEXT: s_lshr_b32 s59, s18, 16 -; GFX11-NEXT: s_lshr_b32 s60, s17, 16 -; GFX11-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-NEXT: s_lshr_b32 s24, s22, 16 +; GFX11-NEXT: s_lshr_b32 s25, s23, 16 +; GFX11-NEXT: s_lshr_b32 s26, s21, 16 +; GFX11-NEXT: s_lshr_b32 s27, s20, 16 +; GFX11-NEXT: s_lshr_b32 s28, s19, 16 +; GFX11-NEXT: s_lshr_b32 s29, s18, 16 +; GFX11-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-NEXT: s_lshr_b32 s42, s15, 16 +; GFX11-NEXT: s_lshr_b32 s43, s14, 16 +; GFX11-NEXT: s_lshr_b32 s44, s13, 16 +; GFX11-NEXT: s_lshr_b32 s45, s12, 16 +; GFX11-NEXT: s_lshr_b32 s46, s11, 16 +; GFX11-NEXT: s_lshr_b32 s47, s10, 16 +; GFX11-NEXT: s_lshr_b32 s56, s9, 16 +; GFX11-NEXT: s_lshr_b32 s57, s8, 16 +; GFX11-NEXT: s_lshr_b32 s58, s7, 16 +; GFX11-NEXT: s_lshr_b32 s59, s6, 16 +; GFX11-NEXT: s_lshr_b32 s60, s5, 16 +; GFX11-NEXT: s_lshr_b32 s61, s4, 16 ; GFX11-NEXT: s_lshr_b32 s62, s3, 16 ; GFX11-NEXT: s_lshr_b32 s63, s2, 16 ; GFX11-NEXT: s_lshr_b32 s72, s1, 16 @@ -8038,50 +8260,50 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s74 ; GFX11-NEXT: s_cbranch_vccnz .LBB17_3 ; GFX11-NEXT: .LBB17_2: ; %cmp.true -; GFX11-NEXT: s_add_i32 s8, s8, 3 -; GFX11-NEXT: s_add_i32 s9, s9, 3 -; GFX11-NEXT: s_add_i32 s7, s7, 3 -; GFX11-NEXT: s_add_i32 s6, s6, 3 -; GFX11-NEXT: s_add_i32 s5, s5, 3 -; GFX11-NEXT: s_add_i32 s4, s4, 3 -; GFX11-NEXT: s_add_i32 s29, s29, 3 -; GFX11-NEXT: s_add_i32 s28, s28, 3 -; GFX11-NEXT: s_add_i32 s27, s27, 3 -; GFX11-NEXT: s_add_i32 s26, s26, 3 -; GFX11-NEXT: s_add_i32 s25, s25, 3 -; GFX11-NEXT: s_add_i32 s24, s24, 3 -; GFX11-NEXT: s_add_i32 s23, s23, 3 ; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s23, s23, 3 ; GFX11-NEXT: s_add_i32 s21, s21, 3 ; GFX11-NEXT: s_add_i32 s20, s20, 3 ; GFX11-NEXT: s_add_i32 s19, s19, 3 ; GFX11-NEXT: s_add_i32 s18, s18, 3 ; GFX11-NEXT: s_add_i32 s17, s17, 3 ; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s15, s15, 3 +; GFX11-NEXT: s_add_i32 s14, s14, 3 +; GFX11-NEXT: s_add_i32 s13, s13, 3 +; GFX11-NEXT: s_add_i32 s12, s12, 3 +; GFX11-NEXT: s_add_i32 s11, s11, 3 +; GFX11-NEXT: s_add_i32 s10, s10, 3 +; GFX11-NEXT: s_add_i32 s9, s9, 3 +; GFX11-NEXT: s_add_i32 s8, s8, 3 +; GFX11-NEXT: s_add_i32 s7, s7, 3 +; GFX11-NEXT: s_add_i32 s6, s6, 3 +; GFX11-NEXT: s_add_i32 s5, s5, 3 +; GFX11-NEXT: s_add_i32 s4, s4, 3 ; GFX11-NEXT: s_add_i32 s3, s3, 3 ; GFX11-NEXT: s_add_i32 s2, s2, 3 ; GFX11-NEXT: s_add_i32 s1, s1, 3 ; GFX11-NEXT: s_add_i32 s0, s0, 3 -; GFX11-NEXT: s_lshr_b32 s10, s8, 16 -; GFX11-NEXT: s_lshr_b32 s11, s9, 16 -; GFX11-NEXT: s_lshr_b32 s12, s7, 16 -; GFX11-NEXT: s_lshr_b32 s13, s6, 16 -; GFX11-NEXT: s_lshr_b32 s14, s5, 16 -; GFX11-NEXT: s_lshr_b32 s15, s4, 16 -; GFX11-NEXT: s_lshr_b32 s40, s29, 16 -; GFX11-NEXT: s_lshr_b32 s41, s28, 16 -; GFX11-NEXT: s_lshr_b32 s42, s27, 16 -; GFX11-NEXT: s_lshr_b32 s43, s26, 16 -; GFX11-NEXT: s_lshr_b32 s44, s25, 16 -; GFX11-NEXT: s_lshr_b32 s45, s24, 16 -; GFX11-NEXT: s_lshr_b32 s46, s23, 16 -; GFX11-NEXT: s_lshr_b32 s47, s22, 16 -; GFX11-NEXT: s_lshr_b32 s56, s21, 16 -; GFX11-NEXT: s_lshr_b32 s57, s20, 16 -; GFX11-NEXT: s_lshr_b32 s58, s19, 16 -; GFX11-NEXT: s_lshr_b32 s59, s18, 16 -; GFX11-NEXT: s_lshr_b32 s60, s17, 16 -; GFX11-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-NEXT: s_lshr_b32 s24, s22, 16 +; GFX11-NEXT: s_lshr_b32 s25, s23, 16 +; GFX11-NEXT: s_lshr_b32 s26, s21, 16 +; GFX11-NEXT: s_lshr_b32 s27, s20, 16 +; GFX11-NEXT: s_lshr_b32 s28, s19, 16 +; GFX11-NEXT: s_lshr_b32 s29, s18, 16 +; GFX11-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-NEXT: s_lshr_b32 s42, s15, 16 +; GFX11-NEXT: s_lshr_b32 s43, s14, 16 +; GFX11-NEXT: s_lshr_b32 s44, s13, 16 +; GFX11-NEXT: s_lshr_b32 s45, s12, 16 +; GFX11-NEXT: s_lshr_b32 s46, s11, 16 +; GFX11-NEXT: s_lshr_b32 s47, s10, 16 +; GFX11-NEXT: s_lshr_b32 s56, s9, 16 +; GFX11-NEXT: s_lshr_b32 s57, s8, 16 +; GFX11-NEXT: s_lshr_b32 s58, s7, 16 +; GFX11-NEXT: s_lshr_b32 s59, s6, 16 +; GFX11-NEXT: s_lshr_b32 s60, s5, 16 +; GFX11-NEXT: s_lshr_b32 s61, s4, 16 ; GFX11-NEXT: s_lshr_b32 s62, s3, 16 ; GFX11-NEXT: s_lshr_b32 s63, s2, 16 ; GFX11-NEXT: s_lshr_b32 s72, s1, 16 @@ -8092,38 +8314,38 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i ; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s72 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s63 ; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s62 -; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s61 -; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s60 -; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s59 -; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s58 -; GFX11-NEXT: s_pack_ll_b32_b16 s20, s20, s57 -; GFX11-NEXT: s_pack_ll_b32_b16 s21, s21, s56 -; GFX11-NEXT: s_pack_ll_b32_b16 s22, s22, s47 -; GFX11-NEXT: s_pack_ll_b32_b16 s23, s23, s46 -; GFX11-NEXT: s_pack_ll_b32_b16 s24, s24, s45 -; GFX11-NEXT: s_pack_ll_b32_b16 s25, s25, s44 -; GFX11-NEXT: s_pack_ll_b32_b16 s26, s26, s43 -; GFX11-NEXT: s_pack_ll_b32_b16 s27, s27, s42 -; GFX11-NEXT: s_pack_ll_b32_b16 s28, s28, s41 -; GFX11-NEXT: s_pack_ll_b32_b16 s29, s29, s40 -; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s15 -; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s14 -; GFX11-NEXT: s_pack_ll_b32_b16 s6, s6, s13 -; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s12 -; GFX11-NEXT: s_pack_ll_b32_b16 s9, s9, s11 -; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s10 +; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s61 +; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s60 +; GFX11-NEXT: s_pack_ll_b32_b16 s6, s6, s59 +; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s58 +; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s57 +; GFX11-NEXT: s_pack_ll_b32_b16 s9, s9, s56 +; GFX11-NEXT: s_pack_ll_b32_b16 s10, s10, s47 +; GFX11-NEXT: s_pack_ll_b32_b16 s11, s11, s46 +; GFX11-NEXT: s_pack_ll_b32_b16 s12, s12, s45 +; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s44 +; GFX11-NEXT: s_pack_ll_b32_b16 s14, s14, s43 +; GFX11-NEXT: s_pack_ll_b32_b16 s15, s15, s42 +; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s41 +; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s40 +; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s29 +; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s28 +; GFX11-NEXT: s_pack_ll_b32_b16 s20, s20, s27 +; GFX11-NEXT: s_pack_ll_b32_b16 s21, s21, s26 +; GFX11-NEXT: s_pack_ll_b32_b16 s23, s23, s25 +; GFX11-NEXT: s_pack_ll_b32_b16 s22, s22, s24 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 -; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 -; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 -; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 -; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 -; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 -; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 -; GFX11-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 -; GFX11-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 -; GFX11-NEXT: v_dual_mov_b32 v22, s9 :: v_dual_mov_b32 v23, s8 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GFX11-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v21, s21 +; GFX11-NEXT: v_dual_mov_b32 v22, s23 :: v_dual_mov_b32 v23, s22 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB17_4: ; GFX11-NEXT: ; implicit-def: $sgpr73 @@ -8144,12 +8366,12 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr41 ; GFX11-NEXT: ; implicit-def: $sgpr40 -; GFX11-NEXT: ; implicit-def: $sgpr15 -; GFX11-NEXT: ; implicit-def: $sgpr14 -; GFX11-NEXT: ; implicit-def: $sgpr13 -; GFX11-NEXT: ; implicit-def: $sgpr12 -; GFX11-NEXT: ; implicit-def: $sgpr11 -; GFX11-NEXT: ; implicit-def: $sgpr10 +; GFX11-NEXT: ; implicit-def: $sgpr29 +; GFX11-NEXT: ; implicit-def: $sgpr28 +; GFX11-NEXT: ; implicit-def: $sgpr27 +; GFX11-NEXT: ; implicit-def: $sgpr26 +; GFX11-NEXT: ; implicit-def: $sgpr25 +; GFX11-NEXT: ; implicit-def: $sgpr24 ; GFX11-NEXT: s_branch .LBB17_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -14250,7 +14472,7 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 ; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 ; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v23 ; SI-NEXT: .LBB30_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -17011,440 +17233,502 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; SI-NEXT: v_readfirstlane_b32 s15, v1 -; SI-NEXT: v_readfirstlane_b32 s14, v2 -; SI-NEXT: v_readfirstlane_b32 s13, v3 -; SI-NEXT: v_readfirstlane_b32 s12, v4 -; SI-NEXT: v_readfirstlane_b32 s11, v5 -; SI-NEXT: v_readfirstlane_b32 s10, v6 -; SI-NEXT: v_readfirstlane_b32 s8, v7 -; SI-NEXT: v_readfirstlane_b32 s7, v8 -; SI-NEXT: v_readfirstlane_b32 s6, v9 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v18, s16 +; SI-NEXT: v_mov_b32_e32 v15, s17 +; SI-NEXT: v_mov_b32_e32 v13, s18 +; SI-NEXT: v_mov_b32_e32 v12, s19 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v62, s20 +; SI-NEXT: v_mov_b32_e32 v59, s21 +; SI-NEXT: v_mov_b32_e32 v58, s22 +; SI-NEXT: v_mov_b32_e32 v57, s23 +; SI-NEXT: v_mov_b32_e32 v16, s24 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v10 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v17, s25 +; SI-NEXT: v_mov_b32_e32 v14, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, s28 +; SI-NEXT: v_mov_b32_e32 v60, s29 ; SI-NEXT: s_cbranch_scc0 .LBB33_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v63 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v17 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v58 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v12 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v15 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v5 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v4 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v3 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v18 ; SI-NEXT: s_cbranch_execnz .LBB33_3 ; SI-NEXT: .LBB33_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v3, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v5, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v7, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v16, s24, 1.0 -; SI-NEXT: v_add_f32_e64 v18, s25, 1.0 -; SI-NEXT: v_add_f32_e64 v20, s26, 1.0 -; SI-NEXT: v_add_f32_e64 v22, s27, 1.0 -; SI-NEXT: v_add_f32_e64 v25, s28, 1.0 -; SI-NEXT: v_add_f32_e64 v23, s29, 1.0 -; SI-NEXT: v_add_f32_e64 v21, s15, 1.0 -; SI-NEXT: v_add_f32_e64 v19, s14, 1.0 -; SI-NEXT: v_add_f32_e64 v17, s13, 1.0 -; SI-NEXT: v_add_f32_e64 v15, s12, 1.0 -; SI-NEXT: v_add_f32_e64 v13, s11, 1.0 -; SI-NEXT: v_add_f32_e64 v11, s10, 1.0 -; SI-NEXT: v_add_f32_e64 v9, s8, 1.0 -; SI-NEXT: v_add_f32_e64 v8, s7, 1.0 -; SI-NEXT: v_add_f32_e64 v6, s6, 1.0 -; SI-NEXT: v_add_f32_e64 v4, s9, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v17 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v13 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v11 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v9 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v8 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v6 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v57 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v62 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v59 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v58 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v63 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v60 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_mov_b32_e32 v21, v10 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: .LBB33_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_or_b32_e32 v55, v55, v40 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: buffer_store_dword v55, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v55, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v53, v53, v54 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: buffer_store_dword v53, v55, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v61 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v53, vcc, 8, v0 -; SI-NEXT: v_or_b32_e32 v51, v51, v52 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: buffer_store_dword v51, v53, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v51, vcc, 12, v0 -; SI-NEXT: v_or_b32_e32 v49, v49, v50 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v49, vcc, 16, v0 -; SI-NEXT: v_or_b32_e32 v39, v48, v39 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: buffer_store_dword v39, v49, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v39, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v37, v38, v37 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v37, vcc, 24, v0 -; SI-NEXT: v_or_b32_e32 v35, v36, v35 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v35, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v33, v34, v33 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v33, vcc, 32, v0 -; SI-NEXT: v_or_b32_e32 v31, v32, v31 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v31, vcc, 36, v0 -; SI-NEXT: v_or_b32_e32 v29, v30, v29 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v29, vcc, 40, v0 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v27, vcc, 44, v0 -; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v25 -; SI-NEXT: v_add_i32_e32 v25, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 -; SI-NEXT: v_add_i32_e32 v23, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 -; SI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v10, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: buffer_store_dword v7, v11, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v23 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; kill: killed $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; kill: killed $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; kill: killed $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; kill: killed $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; kill: killed $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; kill: killed $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_branch .LBB33_2 ; ; VI-LABEL: bitcast_v24f32_to_v48f16_scalar: @@ -22113,13 +22397,41 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v12, s30, 0 -; SI-NEXT: v_writelane_b32 v12, s31, 1 +; SI-NEXT: v_writelane_b32 v20, s30, 0 +; SI-NEXT: v_mov_b32_e32 v12, s16 +; SI-NEXT: v_mov_b32_e32 v13, s17 +; SI-NEXT: v_mov_b32_e32 v14, s18 +; SI-NEXT: v_mov_b32_e32 v15, s19 +; SI-NEXT: v_mov_b32_e32 v16, s20 +; SI-NEXT: v_mov_b32_e32 v17, s21 +; SI-NEXT: v_writelane_b32 v20, s31, 1 +; SI-NEXT: v_mov_b32_e32 v18, s22 +; SI-NEXT: v_mov_b32_e32 v19, s23 +; SI-NEXT: v_readfirstlane_b32 s40, v12 +; SI-NEXT: v_mov_b32_e32 v12, s24 +; SI-NEXT: v_readfirstlane_b32 s41, v13 +; SI-NEXT: v_mov_b32_e32 v13, s25 +; SI-NEXT: v_readfirstlane_b32 s24, v14 +; SI-NEXT: v_mov_b32_e32 v14, s26 +; SI-NEXT: v_readfirstlane_b32 s25, v15 +; SI-NEXT: v_mov_b32_e32 v15, s27 +; SI-NEXT: v_readfirstlane_b32 s22, v16 +; SI-NEXT: v_mov_b32_e32 v16, s28 +; SI-NEXT: v_readfirstlane_b32 s23, v17 +; SI-NEXT: v_mov_b32_e32 v17, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; SI-NEXT: v_writelane_b32 v12, s34, 2 +; SI-NEXT: v_writelane_b32 v20, s34, 2 +; SI-NEXT: v_readfirstlane_b32 s20, v18 +; SI-NEXT: v_readfirstlane_b32 s21, v19 +; SI-NEXT: v_readfirstlane_b32 s18, v12 +; SI-NEXT: v_readfirstlane_b32 s19, v13 +; SI-NEXT: v_readfirstlane_b32 s16, v14 +; SI-NEXT: v_readfirstlane_b32 s17, v15 +; SI-NEXT: v_readfirstlane_b32 s14, v16 +; SI-NEXT: v_readfirstlane_b32 s15, v17 ; SI-NEXT: v_readfirstlane_b32 s12, v1 ; SI-NEXT: v_readfirstlane_b32 s13, v2 ; SI-NEXT: v_readfirstlane_b32 s10, v3 @@ -22129,9 +22441,9 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3 ; SI-NEXT: v_readfirstlane_b32 s6, v7 ; SI-NEXT: v_readfirstlane_b32 s7, v8 ; SI-NEXT: v_readfirstlane_b32 s4, v9 -; SI-NEXT: s_and_b64 s[14:15], vcc, exec +; SI-NEXT: s_and_b64 s[26:27], vcc, exec ; SI-NEXT: v_readfirstlane_b32 s5, v10 -; SI-NEXT: v_writelane_b32 v12, s35, 3 +; SI-NEXT: v_writelane_b32 v20, s35, 3 ; SI-NEXT: s_cbranch_scc0 .LBB41_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s88, s5, 16 @@ -22139,25 +22451,25 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3 ; SI-NEXT: s_lshr_b32 s90, s9, 16 ; SI-NEXT: s_lshr_b32 s91, s11, 16 ; SI-NEXT: s_lshr_b32 s92, s13, 16 -; SI-NEXT: s_lshr_b32 s93, s29, 16 -; SI-NEXT: s_lshr_b32 s94, s27, 16 -; SI-NEXT: s_lshr_b32 s95, s25, 16 -; SI-NEXT: s_lshr_b32 s30, s23, 16 -; SI-NEXT: s_lshr_b32 s31, s21, 16 -; SI-NEXT: s_lshr_b32 s34, s19, 16 -; SI-NEXT: s_lshr_b32 s35, s17, 16 -; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[40:41], s[6:7], 16 +; SI-NEXT: s_lshr_b32 s93, s15, 16 +; SI-NEXT: s_lshr_b32 s94, s17, 16 +; SI-NEXT: s_lshr_b32 s95, s19, 16 +; SI-NEXT: s_lshr_b32 s30, s21, 16 +; SI-NEXT: s_lshr_b32 s31, s23, 16 +; SI-NEXT: s_lshr_b32 s34, s25, 16 +; SI-NEXT: s_lshr_b32 s35, s41, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 ; SI-NEXT: s_lshr_b64 s[42:43], s[8:9], 16 ; SI-NEXT: s_lshr_b64 s[44:45], s[10:11], 16 ; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 16 -; SI-NEXT: s_lshr_b64 s[56:57], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[58:59], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[60:61], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[62:63], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[72:73], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[74:75], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[40:41], 16 ; SI-NEXT: s_cbranch_execnz .LBB41_3 ; SI-NEXT: .LBB41_2: ; %cmp.true ; SI-NEXT: s_add_u32 s4, s4, 3 @@ -22170,142 +22482,142 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3 ; SI-NEXT: s_addc_u32 s11, s11, 0 ; SI-NEXT: s_add_u32 s12, s12, 3 ; SI-NEXT: s_addc_u32 s13, s13, 0 -; SI-NEXT: s_add_u32 s28, s28, 3 -; SI-NEXT: s_addc_u32 s29, s29, 0 -; SI-NEXT: s_add_u32 s26, s26, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 -; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s25, s25, 0 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 ; SI-NEXT: s_add_u32 s16, s16, 3 ; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s40, s40, 3 +; SI-NEXT: s_addc_u32 s41, s41, 0 ; SI-NEXT: s_lshr_b32 s88, s5, 16 ; SI-NEXT: s_lshr_b32 s89, s7, 16 ; SI-NEXT: s_lshr_b32 s90, s9, 16 ; SI-NEXT: s_lshr_b32 s91, s11, 16 ; SI-NEXT: s_lshr_b32 s92, s13, 16 -; SI-NEXT: s_lshr_b32 s93, s29, 16 -; SI-NEXT: s_lshr_b32 s94, s27, 16 -; SI-NEXT: s_lshr_b32 s95, s25, 16 -; SI-NEXT: s_lshr_b32 s30, s23, 16 -; SI-NEXT: s_lshr_b32 s31, s21, 16 -; SI-NEXT: s_lshr_b32 s34, s19, 16 -; SI-NEXT: s_lshr_b32 s35, s17, 16 -; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[40:41], s[6:7], 16 +; SI-NEXT: s_lshr_b32 s93, s15, 16 +; SI-NEXT: s_lshr_b32 s94, s17, 16 +; SI-NEXT: s_lshr_b32 s95, s19, 16 +; SI-NEXT: s_lshr_b32 s30, s21, 16 +; SI-NEXT: s_lshr_b32 s31, s23, 16 +; SI-NEXT: s_lshr_b32 s34, s25, 16 +; SI-NEXT: s_lshr_b32 s35, s41, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 ; SI-NEXT: s_lshr_b64 s[42:43], s[8:9], 16 ; SI-NEXT: s_lshr_b64 s[44:45], s[10:11], 16 ; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 16 -; SI-NEXT: s_lshr_b64 s[56:57], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[58:59], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[60:61], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[62:63], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[72:73], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[74:75], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[40:41], 16 ; SI-NEXT: .LBB41_3: ; %end -; SI-NEXT: s_lshl_b32 s15, s76, 16 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s15, s16, s15 -; SI-NEXT: v_mov_b32_e32 v1, s15 -; SI-NEXT: s_and_b32 s15, s17, 0xffff -; SI-NEXT: s_lshl_b32 s16, s35, 16 -; SI-NEXT: s_or_b32 s15, s15, s16 -; SI-NEXT: v_mov_b32_e32 v2, s15 -; SI-NEXT: s_lshl_b32 s15, s74, 16 -; SI-NEXT: s_and_b32 s16, s18, 0xffff -; SI-NEXT: s_or_b32 s15, s16, s15 -; SI-NEXT: v_mov_b32_e32 v3, s15 -; SI-NEXT: s_and_b32 s15, s19, 0xffff -; SI-NEXT: s_lshl_b32 s16, s34, 16 +; SI-NEXT: s_lshl_b32 s27, s76, 16 +; SI-NEXT: s_and_b32 s29, s40, 0xffff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: v_mov_b32_e32 v1, s27 +; SI-NEXT: s_and_b32 s27, s41, 0xffff +; SI-NEXT: s_lshl_b32 s29, s35, 16 +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_lshl_b32 s27, s74, 16 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_or_b32 s24, s24, s27 +; SI-NEXT: v_mov_b32_e32 v3, s24 +; SI-NEXT: s_and_b32 s24, s25, 0xffff +; SI-NEXT: s_lshl_b32 s25, s34, 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: s_or_b32 s24, s24, s25 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s15 -; SI-NEXT: s_and_b32 s15, s20, 0xffff -; SI-NEXT: s_lshl_b32 s16, s72, 16 +; SI-NEXT: v_mov_b32_e32 v2, s24 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_lshl_b32 s24, s72, 16 ; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: s_or_b32 s22, s22, s24 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s15 -; SI-NEXT: s_and_b32 s15, s21, 0xffff -; SI-NEXT: s_lshl_b32 s16, s31, 16 +; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: s_and_b32 s22, s23, 0xffff +; SI-NEXT: s_lshl_b32 s23, s31, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: s_or_b32 s22, s22, s23 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s15 -; SI-NEXT: s_and_b32 s15, s22, 0xffff -; SI-NEXT: s_lshl_b32 s16, s62, 16 +; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_lshl_b32 s22, s62, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: s_or_b32 s20, s20, s22 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s15 -; SI-NEXT: s_and_b32 s15, s23, 0xffff -; SI-NEXT: s_lshl_b32 s16, s30, 16 +; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_and_b32 s20, s21, 0xffff +; SI-NEXT: s_lshl_b32 s21, s30, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: s_or_b32 s20, s20, s21 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s15 -; SI-NEXT: s_and_b32 s15, s24, 0xffff -; SI-NEXT: s_lshl_b32 s16, s60, 16 +; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s20, s60, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: s_or_b32 s18, s18, s20 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s15 -; SI-NEXT: s_and_b32 s15, s25, 0xffff -; SI-NEXT: s_lshl_b32 s16, s95, 16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_and_b32 s18, s19, 0xffff +; SI-NEXT: s_lshl_b32 s19, s95, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s15 -; SI-NEXT: s_and_b32 s15, s26, 0xffff -; SI-NEXT: s_lshl_b32 s16, s58, 16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s18, s58, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: s_or_b32 s16, s16, s18 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s15 -; SI-NEXT: s_and_b32 s15, s27, 0xffff -; SI-NEXT: s_lshl_b32 s16, s94, 16 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s17, 0xffff +; SI-NEXT: s_lshl_b32 s17, s94, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 -; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s15 -; SI-NEXT: s_and_b32 s15, s28, 0xffff +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: s_lshl_b32 s16, s56, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: s_or_b32 s14, s14, s16 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s15 -; SI-NEXT: s_and_b32 s15, s29, 0xffff -; SI-NEXT: s_lshl_b32 s16, s93, 16 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_and_b32 s14, s15, 0xffff +; SI-NEXT: s_lshl_b32 s15, s93, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s15 +; SI-NEXT: v_mov_b32_e32 v2, s14 ; SI-NEXT: s_and_b32 s12, s12, 0xffff -; SI-NEXT: s_lshl_b32 s15, s46, 16 +; SI-NEXT: s_lshl_b32 s14, s46, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 -; SI-NEXT: s_or_b32 s12, s12, s15 +; SI-NEXT: s_or_b32 s12, s12, s14 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s12 @@ -22345,7 +22657,7 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_lshl_b32 s8, s40, 16 +; SI-NEXT: s_lshl_b32 s8, s28, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x4c, v0 ; SI-NEXT: s_or_b32 s6, s6, s8 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -22359,7 +22671,7 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s6, s14, 16 +; SI-NEXT: s_lshl_b32 s6, s26, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x54, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -22373,12 +22685,12 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: v_readlane_b32 s35, v12, 3 -; SI-NEXT: v_readlane_b32 s34, v12, 2 -; SI-NEXT: v_readlane_b32 s31, v12, 1 -; SI-NEXT: v_readlane_b32 s30, v12, 0 +; SI-NEXT: v_readlane_b32 s35, v20, 3 +; SI-NEXT: v_readlane_b32 s34, v20, 2 +; SI-NEXT: v_readlane_b32 s31, v20, 1 +; SI-NEXT: v_readlane_b32 s30, v20, 0 ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -22403,16 +22715,44 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr91 ; SI-NEXT: ; implicit-def: $sgpr42 ; SI-NEXT: ; implicit-def: $sgpr90 -; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr89 -; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr88 ; SI-NEXT: s_branch .LBB41_2 ; ; VI-LABEL: bitcast_v12i64_to_v48i16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v11, s16 +; VI-NEXT: v_mov_b32_e32 v12, s17 +; VI-NEXT: v_mov_b32_e32 v13, s18 +; VI-NEXT: v_mov_b32_e32 v14, s19 +; VI-NEXT: v_mov_b32_e32 v15, s20 +; VI-NEXT: v_mov_b32_e32 v16, s21 +; VI-NEXT: v_mov_b32_e32 v17, s22 +; VI-NEXT: v_mov_b32_e32 v18, s23 +; VI-NEXT: v_mov_b32_e32 v19, s24 +; VI-NEXT: v_readfirstlane_b32 s42, v11 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_readfirstlane_b32 s41, v12 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_readfirstlane_b32 s40, v13 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_readfirstlane_b32 s26, v14 +; VI-NEXT: v_mov_b32_e32 v14, s28 +; VI-NEXT: v_readfirstlane_b32 s25, v15 +; VI-NEXT: v_mov_b32_e32 v15, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_readfirstlane_b32 s24, v16 +; VI-NEXT: v_readfirstlane_b32 s23, v17 +; VI-NEXT: v_readfirstlane_b32 s22, v18 +; VI-NEXT: v_readfirstlane_b32 s21, v19 +; VI-NEXT: v_readfirstlane_b32 s20, v11 +; VI-NEXT: v_readfirstlane_b32 s19, v12 +; VI-NEXT: v_readfirstlane_b32 s18, v13 +; VI-NEXT: v_readfirstlane_b32 s17, v14 +; VI-NEXT: v_readfirstlane_b32 s16, v15 ; VI-NEXT: v_readfirstlane_b32 s15, v0 ; VI-NEXT: v_readfirstlane_b32 s14, v1 ; VI-NEXT: v_readfirstlane_b32 s13, v2 @@ -22426,9 +22766,9 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3 ; VI-NEXT: v_readfirstlane_b32 s7, v9 ; VI-NEXT: s_cbranch_scc0 .LBB41_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s40, s7, 16 -; VI-NEXT: s_lshr_b32 s41, s6, 16 -; VI-NEXT: s_lshr_b32 s42, s8, 16 +; VI-NEXT: s_lshr_b32 s27, s7, 16 +; VI-NEXT: s_lshr_b32 s28, s6, 16 +; VI-NEXT: s_lshr_b32 s29, s8, 16 ; VI-NEXT: s_lshr_b32 s43, s9, 16 ; VI-NEXT: s_lshr_b32 s44, s10, 16 ; VI-NEXT: s_lshr_b32 s45, s11, 16 @@ -22436,20 +22776,20 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3 ; VI-NEXT: s_lshr_b32 s47, s13, 16 ; VI-NEXT: s_lshr_b32 s56, s14, 16 ; VI-NEXT: s_lshr_b32 s57, s15, 16 -; VI-NEXT: s_lshr_b32 s58, s29, 16 -; VI-NEXT: s_lshr_b32 s59, s28, 16 -; VI-NEXT: s_lshr_b32 s60, s27, 16 -; VI-NEXT: s_lshr_b32 s61, s26, 16 -; VI-NEXT: s_lshr_b32 s62, s25, 16 -; VI-NEXT: s_lshr_b32 s63, s24, 16 -; VI-NEXT: s_lshr_b32 s72, s23, 16 -; VI-NEXT: s_lshr_b32 s73, s22, 16 -; VI-NEXT: s_lshr_b32 s74, s21, 16 -; VI-NEXT: s_lshr_b32 s75, s20, 16 -; VI-NEXT: s_lshr_b32 s76, s19, 16 -; VI-NEXT: s_lshr_b32 s77, s18, 16 -; VI-NEXT: s_lshr_b32 s78, s17, 16 -; VI-NEXT: s_lshr_b32 s79, s16, 16 +; VI-NEXT: s_lshr_b32 s58, s16, 16 +; VI-NEXT: s_lshr_b32 s59, s17, 16 +; VI-NEXT: s_lshr_b32 s60, s18, 16 +; VI-NEXT: s_lshr_b32 s61, s19, 16 +; VI-NEXT: s_lshr_b32 s62, s20, 16 +; VI-NEXT: s_lshr_b32 s63, s21, 16 +; VI-NEXT: s_lshr_b32 s72, s22, 16 +; VI-NEXT: s_lshr_b32 s73, s23, 16 +; VI-NEXT: s_lshr_b32 s74, s24, 16 +; VI-NEXT: s_lshr_b32 s75, s25, 16 +; VI-NEXT: s_lshr_b32 s76, s26, 16 +; VI-NEXT: s_lshr_b32 s77, s40, 16 +; VI-NEXT: s_lshr_b32 s78, s41, 16 +; VI-NEXT: s_lshr_b32 s79, s42, 16 ; VI-NEXT: s_cbranch_execnz .LBB41_3 ; VI-NEXT: .LBB41_2: ; %cmp.true ; VI-NEXT: s_add_u32 s6, s6, 3 @@ -22462,23 +22802,23 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3 ; VI-NEXT: s_addc_u32 s12, s12, 0 ; VI-NEXT: s_add_u32 s15, s15, 3 ; VI-NEXT: s_addc_u32 s14, s14, 0 -; VI-NEXT: s_add_u32 s28, s28, 3 -; VI-NEXT: s_addc_u32 s29, s29, 0 -; VI-NEXT: s_add_u32 s26, s26, 3 -; VI-NEXT: s_addc_u32 s27, s27, 0 -; VI-NEXT: s_add_u32 s24, s24, 3 -; VI-NEXT: s_addc_u32 s25, s25, 0 -; VI-NEXT: s_add_u32 s22, s22, 3 -; VI-NEXT: s_addc_u32 s23, s23, 0 -; VI-NEXT: s_add_u32 s20, s20, 3 -; VI-NEXT: s_addc_u32 s21, s21, 0 -; VI-NEXT: s_add_u32 s18, s18, 3 -; VI-NEXT: s_addc_u32 s19, s19, 0 -; VI-NEXT: s_add_u32 s16, s16, 3 -; VI-NEXT: s_addc_u32 s17, s17, 0 -; VI-NEXT: s_lshr_b32 s40, s7, 16 -; VI-NEXT: s_lshr_b32 s41, s6, 16 -; VI-NEXT: s_lshr_b32 s42, s8, 16 +; VI-NEXT: s_add_u32 s17, s17, 3 +; VI-NEXT: s_addc_u32 s16, s16, 0 +; VI-NEXT: s_add_u32 s19, s19, 3 +; VI-NEXT: s_addc_u32 s18, s18, 0 +; VI-NEXT: s_add_u32 s21, s21, 3 +; VI-NEXT: s_addc_u32 s20, s20, 0 +; VI-NEXT: s_add_u32 s23, s23, 3 +; VI-NEXT: s_addc_u32 s22, s22, 0 +; VI-NEXT: s_add_u32 s25, s25, 3 +; VI-NEXT: s_addc_u32 s24, s24, 0 +; VI-NEXT: s_add_u32 s40, s40, 3 +; VI-NEXT: s_addc_u32 s26, s26, 0 +; VI-NEXT: s_add_u32 s42, s42, 3 +; VI-NEXT: s_addc_u32 s41, s41, 0 +; VI-NEXT: s_lshr_b32 s27, s7, 16 +; VI-NEXT: s_lshr_b32 s28, s6, 16 +; VI-NEXT: s_lshr_b32 s29, s8, 16 ; VI-NEXT: s_lshr_b32 s43, s9, 16 ; VI-NEXT: s_lshr_b32 s44, s10, 16 ; VI-NEXT: s_lshr_b32 s45, s11, 16 @@ -22486,107 +22826,107 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3 ; VI-NEXT: s_lshr_b32 s47, s13, 16 ; VI-NEXT: s_lshr_b32 s56, s14, 16 ; VI-NEXT: s_lshr_b32 s57, s15, 16 -; VI-NEXT: s_lshr_b32 s58, s29, 16 -; VI-NEXT: s_lshr_b32 s59, s28, 16 -; VI-NEXT: s_lshr_b32 s60, s27, 16 -; VI-NEXT: s_lshr_b32 s61, s26, 16 -; VI-NEXT: s_lshr_b32 s62, s25, 16 -; VI-NEXT: s_lshr_b32 s63, s24, 16 -; VI-NEXT: s_lshr_b32 s72, s23, 16 -; VI-NEXT: s_lshr_b32 s73, s22, 16 -; VI-NEXT: s_lshr_b32 s74, s21, 16 -; VI-NEXT: s_lshr_b32 s75, s20, 16 -; VI-NEXT: s_lshr_b32 s76, s19, 16 -; VI-NEXT: s_lshr_b32 s77, s18, 16 -; VI-NEXT: s_lshr_b32 s78, s17, 16 -; VI-NEXT: s_lshr_b32 s79, s16, 16 +; VI-NEXT: s_lshr_b32 s58, s16, 16 +; VI-NEXT: s_lshr_b32 s59, s17, 16 +; VI-NEXT: s_lshr_b32 s60, s18, 16 +; VI-NEXT: s_lshr_b32 s61, s19, 16 +; VI-NEXT: s_lshr_b32 s62, s20, 16 +; VI-NEXT: s_lshr_b32 s63, s21, 16 +; VI-NEXT: s_lshr_b32 s72, s22, 16 +; VI-NEXT: s_lshr_b32 s73, s23, 16 +; VI-NEXT: s_lshr_b32 s74, s24, 16 +; VI-NEXT: s_lshr_b32 s75, s25, 16 +; VI-NEXT: s_lshr_b32 s76, s26, 16 +; VI-NEXT: s_lshr_b32 s77, s40, 16 +; VI-NEXT: s_lshr_b32 s78, s41, 16 +; VI-NEXT: s_lshr_b32 s79, s42, 16 ; VI-NEXT: .LBB41_3: ; %end -; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_and_b32 s4, 0xffff, s42 ; VI-NEXT: s_lshl_b32 s5, s79, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s16, s78, 16 -; VI-NEXT: s_or_b32 s5, s5, s16 -; VI-NEXT: s_and_b32 s16, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s17, s77, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_and_b32 s17, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s18, s76, 16 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s18, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s19, s75, 16 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_and_b32 s19, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s20, s74, 16 -; VI-NEXT: s_or_b32 s19, s19, s20 -; VI-NEXT: s_and_b32 s20, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s21, s73, 16 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_and_b32 s21, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s22, s72, 16 -; VI-NEXT: s_or_b32 s21, s21, s22 -; VI-NEXT: s_and_b32 s22, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s23, s63, 16 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_and_b32 s23, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s24, s62, 16 -; VI-NEXT: s_or_b32 s23, s23, s24 -; VI-NEXT: s_and_b32 s24, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s25, s61, 16 -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_and_b32 s25, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s26, s60, 16 -; VI-NEXT: s_or_b32 s25, s25, s26 -; VI-NEXT: s_and_b32 s26, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s27, s59, 16 -; VI-NEXT: s_or_b32 s26, s26, s27 -; VI-NEXT: s_and_b32 s27, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s28, s58, 16 -; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s5, 0xffff, s41 +; VI-NEXT: s_lshl_b32 s41, s78, 16 +; VI-NEXT: s_or_b32 s5, s5, s41 +; VI-NEXT: s_and_b32 s40, 0xffff, s40 +; VI-NEXT: s_lshl_b32 s41, s77, 16 +; VI-NEXT: s_or_b32 s40, s40, s41 +; VI-NEXT: s_and_b32 s26, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s41, s76, 16 +; VI-NEXT: s_or_b32 s26, s26, s41 +; VI-NEXT: s_and_b32 s25, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s41, s75, 16 +; VI-NEXT: s_or_b32 s25, s25, s41 +; VI-NEXT: s_and_b32 s24, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s41, s74, 16 +; VI-NEXT: s_or_b32 s24, s24, s41 +; VI-NEXT: s_and_b32 s23, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s41, s73, 16 +; VI-NEXT: s_or_b32 s23, s23, s41 +; VI-NEXT: s_and_b32 s22, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s41, s72, 16 +; VI-NEXT: s_or_b32 s22, s22, s41 +; VI-NEXT: s_and_b32 s21, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s41, s63, 16 +; VI-NEXT: s_or_b32 s21, s21, s41 +; VI-NEXT: s_and_b32 s20, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s41, s62, 16 +; VI-NEXT: s_or_b32 s20, s20, s41 +; VI-NEXT: s_and_b32 s19, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s41, s61, 16 +; VI-NEXT: s_or_b32 s19, s19, s41 +; VI-NEXT: s_and_b32 s18, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s41, s60, 16 +; VI-NEXT: s_or_b32 s18, s18, s41 +; VI-NEXT: s_and_b32 s17, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s41, s59, 16 +; VI-NEXT: s_or_b32 s17, s17, s41 +; VI-NEXT: s_and_b32 s16, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s41, s58, 16 +; VI-NEXT: s_or_b32 s16, s16, s41 ; VI-NEXT: s_and_b32 s15, 0xffff, s15 -; VI-NEXT: s_lshl_b32 s28, s57, 16 -; VI-NEXT: s_or_b32 s15, s15, s28 +; VI-NEXT: s_lshl_b32 s41, s57, 16 +; VI-NEXT: s_or_b32 s15, s15, s41 ; VI-NEXT: s_and_b32 s14, 0xffff, s14 -; VI-NEXT: s_lshl_b32 s28, s56, 16 -; VI-NEXT: s_or_b32 s14, s14, s28 +; VI-NEXT: s_lshl_b32 s41, s56, 16 +; VI-NEXT: s_or_b32 s14, s14, s41 ; VI-NEXT: s_and_b32 s13, 0xffff, s13 -; VI-NEXT: s_lshl_b32 s28, s47, 16 -; VI-NEXT: s_or_b32 s13, s13, s28 +; VI-NEXT: s_lshl_b32 s41, s47, 16 +; VI-NEXT: s_or_b32 s13, s13, s41 ; VI-NEXT: s_and_b32 s12, 0xffff, s12 -; VI-NEXT: s_lshl_b32 s28, s46, 16 -; VI-NEXT: s_or_b32 s12, s12, s28 +; VI-NEXT: s_lshl_b32 s41, s46, 16 +; VI-NEXT: s_or_b32 s12, s12, s41 ; VI-NEXT: s_and_b32 s11, 0xffff, s11 -; VI-NEXT: s_lshl_b32 s28, s45, 16 -; VI-NEXT: s_or_b32 s11, s11, s28 +; VI-NEXT: s_lshl_b32 s41, s45, 16 +; VI-NEXT: s_or_b32 s11, s11, s41 ; VI-NEXT: s_and_b32 s10, 0xffff, s10 -; VI-NEXT: s_lshl_b32 s28, s44, 16 -; VI-NEXT: s_or_b32 s10, s10, s28 +; VI-NEXT: s_lshl_b32 s41, s44, 16 +; VI-NEXT: s_or_b32 s10, s10, s41 ; VI-NEXT: s_and_b32 s9, 0xffff, s9 -; VI-NEXT: s_lshl_b32 s28, s43, 16 -; VI-NEXT: s_or_b32 s9, s9, s28 +; VI-NEXT: s_lshl_b32 s41, s43, 16 ; VI-NEXT: s_and_b32 s8, 0xffff, s8 -; VI-NEXT: s_lshl_b32 s28, s42, 16 -; VI-NEXT: s_or_b32 s8, s8, s28 +; VI-NEXT: s_lshl_b32 s29, s29, 16 ; VI-NEXT: s_and_b32 s6, 0xffff, s6 -; VI-NEXT: s_lshl_b32 s28, s41, 16 -; VI-NEXT: s_or_b32 s6, s6, s28 +; VI-NEXT: s_lshl_b32 s28, s28, 16 ; VI-NEXT: s_and_b32 s7, 0xffff, s7 -; VI-NEXT: s_lshl_b32 s28, s40, 16 -; VI-NEXT: s_or_b32 s7, s7, s28 +; VI-NEXT: s_lshl_b32 s27, s27, 16 +; VI-NEXT: s_or_b32 s9, s9, s41 +; VI-NEXT: s_or_b32 s8, s8, s29 +; VI-NEXT: s_or_b32 s6, s6, s28 +; VI-NEXT: s_or_b32 s7, s7, s27 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: v_mov_b32_e32 v3, s17 -; VI-NEXT: v_mov_b32_e32 v4, s18 -; VI-NEXT: v_mov_b32_e32 v5, s19 -; VI-NEXT: v_mov_b32_e32 v6, s20 -; VI-NEXT: v_mov_b32_e32 v7, s21 -; VI-NEXT: v_mov_b32_e32 v8, s22 -; VI-NEXT: v_mov_b32_e32 v9, s23 -; VI-NEXT: v_mov_b32_e32 v10, s24 -; VI-NEXT: v_mov_b32_e32 v11, s25 -; VI-NEXT: v_mov_b32_e32 v12, s26 -; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v2, s40 +; VI-NEXT: v_mov_b32_e32 v3, s26 +; VI-NEXT: v_mov_b32_e32 v4, s25 +; VI-NEXT: v_mov_b32_e32 v5, s24 +; VI-NEXT: v_mov_b32_e32 v6, s23 +; VI-NEXT: v_mov_b32_e32 v7, s22 +; VI-NEXT: v_mov_b32_e32 v8, s21 +; VI-NEXT: v_mov_b32_e32 v9, s20 +; VI-NEXT: v_mov_b32_e32 v10, s19 +; VI-NEXT: v_mov_b32_e32 v11, s18 +; VI-NEXT: v_mov_b32_e32 v12, s17 +; VI-NEXT: v_mov_b32_e32 v13, s16 ; VI-NEXT: v_mov_b32_e32 v14, s15 ; VI-NEXT: v_mov_b32_e32 v15, s14 ; VI-NEXT: v_mov_b32_e32 v16, s13 @@ -22620,64 +22960,82 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3 ; VI-NEXT: ; implicit-def: $sgpr45 ; VI-NEXT: ; implicit-def: $sgpr44 ; VI-NEXT: ; implicit-def: $sgpr43 -; VI-NEXT: ; implicit-def: $sgpr42 -; VI-NEXT: ; implicit-def: $sgpr41 -; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 ; VI-NEXT: s_branch .LBB41_2 ; ; GFX9-LABEL: bitcast_v12i64_to_v48i16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v11, s16 +; GFX9-NEXT: v_mov_b32_e32 v12, s17 +; GFX9-NEXT: v_mov_b32_e32 v13, s18 +; GFX9-NEXT: v_mov_b32_e32 v14, s19 +; GFX9-NEXT: v_mov_b32_e32 v15, s20 +; GFX9-NEXT: v_mov_b32_e32 v16, s21 +; GFX9-NEXT: v_mov_b32_e32 v17, s22 +; GFX9-NEXT: v_mov_b32_e32 v18, s23 +; GFX9-NEXT: v_mov_b32_e32 v19, s24 +; GFX9-NEXT: v_readfirstlane_b32 s6, v11 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_readfirstlane_b32 s7, v12 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_readfirstlane_b32 s8, v13 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_readfirstlane_b32 s9, v14 +; GFX9-NEXT: v_mov_b32_e32 v14, s28 +; GFX9-NEXT: v_readfirstlane_b32 s10, v15 +; GFX9-NEXT: v_mov_b32_e32 v15, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: v_readfirstlane_b32 s7, v1 -; GFX9-NEXT: v_readfirstlane_b32 s8, v2 -; GFX9-NEXT: v_readfirstlane_b32 s9, v3 -; GFX9-NEXT: v_readfirstlane_b32 s10, v4 -; GFX9-NEXT: v_readfirstlane_b32 s11, v5 -; GFX9-NEXT: v_readfirstlane_b32 s12, v6 -; GFX9-NEXT: v_readfirstlane_b32 s13, v7 -; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: v_readfirstlane_b32 s11, v16 +; GFX9-NEXT: v_readfirstlane_b32 s12, v17 +; GFX9-NEXT: v_readfirstlane_b32 s13, v18 +; GFX9-NEXT: v_readfirstlane_b32 s14, v19 +; GFX9-NEXT: v_readfirstlane_b32 s15, v11 +; GFX9-NEXT: v_readfirstlane_b32 s16, v12 +; GFX9-NEXT: v_readfirstlane_b32 s17, v13 +; GFX9-NEXT: v_readfirstlane_b32 s18, v14 +; GFX9-NEXT: v_readfirstlane_b32 s19, v15 +; GFX9-NEXT: v_readfirstlane_b32 s20, v0 +; GFX9-NEXT: v_readfirstlane_b32 s21, v1 +; GFX9-NEXT: v_readfirstlane_b32 s22, v2 +; GFX9-NEXT: v_readfirstlane_b32 s23, v3 +; GFX9-NEXT: v_readfirstlane_b32 s24, v4 +; GFX9-NEXT: v_readfirstlane_b32 s25, v5 +; GFX9-NEXT: v_readfirstlane_b32 s26, v6 +; GFX9-NEXT: v_readfirstlane_b32 s27, v7 +; GFX9-NEXT: v_readfirstlane_b32 s28, v8 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s29, v9 ; GFX9-NEXT: s_cbranch_scc0 .LBB41_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_lshr_b32 s40, s15, 16 -; GFX9-NEXT: s_lshr_b32 s41, s14, 16 -; GFX9-NEXT: s_lshr_b32 s42, s13, 16 -; GFX9-NEXT: s_lshr_b32 s43, s12, 16 -; GFX9-NEXT: s_lshr_b32 s44, s11, 16 -; GFX9-NEXT: s_lshr_b32 s45, s10, 16 -; GFX9-NEXT: s_lshr_b32 s46, s9, 16 -; GFX9-NEXT: s_lshr_b32 s47, s8, 16 -; GFX9-NEXT: s_lshr_b32 s56, s7, 16 -; GFX9-NEXT: s_lshr_b32 s57, s6, 16 -; GFX9-NEXT: s_lshr_b32 s58, s29, 16 -; GFX9-NEXT: s_lshr_b32 s59, s28, 16 -; GFX9-NEXT: s_lshr_b32 s60, s27, 16 -; GFX9-NEXT: s_lshr_b32 s61, s26, 16 -; GFX9-NEXT: s_lshr_b32 s62, s25, 16 -; GFX9-NEXT: s_lshr_b32 s63, s24, 16 -; GFX9-NEXT: s_lshr_b32 s72, s23, 16 -; GFX9-NEXT: s_lshr_b32 s73, s22, 16 -; GFX9-NEXT: s_lshr_b32 s74, s21, 16 -; GFX9-NEXT: s_lshr_b32 s75, s20, 16 -; GFX9-NEXT: s_lshr_b32 s76, s19, 16 -; GFX9-NEXT: s_lshr_b32 s77, s18, 16 -; GFX9-NEXT: s_lshr_b32 s78, s17, 16 -; GFX9-NEXT: s_lshr_b32 s79, s16, 16 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s44, s25, 16 +; GFX9-NEXT: s_lshr_b32 s45, s24, 16 +; GFX9-NEXT: s_lshr_b32 s46, s23, 16 +; GFX9-NEXT: s_lshr_b32 s47, s22, 16 +; GFX9-NEXT: s_lshr_b32 s56, s21, 16 +; GFX9-NEXT: s_lshr_b32 s57, s20, 16 +; GFX9-NEXT: s_lshr_b32 s58, s19, 16 +; GFX9-NEXT: s_lshr_b32 s59, s18, 16 +; GFX9-NEXT: s_lshr_b32 s60, s17, 16 +; GFX9-NEXT: s_lshr_b32 s61, s16, 16 +; GFX9-NEXT: s_lshr_b32 s62, s15, 16 +; GFX9-NEXT: s_lshr_b32 s63, s14, 16 +; GFX9-NEXT: s_lshr_b32 s72, s13, 16 +; GFX9-NEXT: s_lshr_b32 s73, s12, 16 +; GFX9-NEXT: s_lshr_b32 s74, s11, 16 +; GFX9-NEXT: s_lshr_b32 s75, s10, 16 +; GFX9-NEXT: s_lshr_b32 s76, s9, 16 +; GFX9-NEXT: s_lshr_b32 s77, s8, 16 +; GFX9-NEXT: s_lshr_b32 s78, s7, 16 +; GFX9-NEXT: s_lshr_b32 s79, s6, 16 ; GFX9-NEXT: s_cbranch_execnz .LBB41_3 ; GFX9-NEXT: .LBB41_2: ; %cmp.true -; GFX9-NEXT: s_add_u32 s14, s14, 3 -; GFX9-NEXT: s_addc_u32 s15, s15, 0 -; GFX9-NEXT: s_add_u32 s12, s12, 3 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-NEXT: s_add_u32 s10, s10, 3 -; GFX9-NEXT: s_addc_u32 s11, s11, 0 -; GFX9-NEXT: s_add_u32 s8, s8, 3 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-NEXT: s_add_u32 s6, s6, 3 -; GFX9-NEXT: s_addc_u32 s7, s7, 0 ; GFX9-NEXT: s_add_u32 s28, s28, 3 ; GFX9-NEXT: s_addc_u32 s29, s29, 0 ; GFX9-NEXT: s_add_u32 s26, s26, 3 @@ -22692,79 +23050,89 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3 ; GFX9-NEXT: s_addc_u32 s19, s19, 0 ; GFX9-NEXT: s_add_u32 s16, s16, 3 ; GFX9-NEXT: s_addc_u32 s17, s17, 0 -; GFX9-NEXT: s_lshr_b32 s40, s15, 16 -; GFX9-NEXT: s_lshr_b32 s41, s14, 16 -; GFX9-NEXT: s_lshr_b32 s42, s13, 16 -; GFX9-NEXT: s_lshr_b32 s43, s12, 16 -; GFX9-NEXT: s_lshr_b32 s44, s11, 16 -; GFX9-NEXT: s_lshr_b32 s45, s10, 16 -; GFX9-NEXT: s_lshr_b32 s46, s9, 16 -; GFX9-NEXT: s_lshr_b32 s47, s8, 16 -; GFX9-NEXT: s_lshr_b32 s56, s7, 16 -; GFX9-NEXT: s_lshr_b32 s57, s6, 16 -; GFX9-NEXT: s_lshr_b32 s58, s29, 16 -; GFX9-NEXT: s_lshr_b32 s59, s28, 16 -; GFX9-NEXT: s_lshr_b32 s60, s27, 16 -; GFX9-NEXT: s_lshr_b32 s61, s26, 16 -; GFX9-NEXT: s_lshr_b32 s62, s25, 16 -; GFX9-NEXT: s_lshr_b32 s63, s24, 16 -; GFX9-NEXT: s_lshr_b32 s72, s23, 16 -; GFX9-NEXT: s_lshr_b32 s73, s22, 16 -; GFX9-NEXT: s_lshr_b32 s74, s21, 16 -; GFX9-NEXT: s_lshr_b32 s75, s20, 16 -; GFX9-NEXT: s_lshr_b32 s76, s19, 16 -; GFX9-NEXT: s_lshr_b32 s77, s18, 16 -; GFX9-NEXT: s_lshr_b32 s78, s17, 16 -; GFX9-NEXT: s_lshr_b32 s79, s16, 16 +; GFX9-NEXT: s_add_u32 s14, s14, 3 +; GFX9-NEXT: s_addc_u32 s15, s15, 0 +; GFX9-NEXT: s_add_u32 s12, s12, 3 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_add_u32 s10, s10, 3 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s44, s25, 16 +; GFX9-NEXT: s_lshr_b32 s45, s24, 16 +; GFX9-NEXT: s_lshr_b32 s46, s23, 16 +; GFX9-NEXT: s_lshr_b32 s47, s22, 16 +; GFX9-NEXT: s_lshr_b32 s56, s21, 16 +; GFX9-NEXT: s_lshr_b32 s57, s20, 16 +; GFX9-NEXT: s_lshr_b32 s58, s19, 16 +; GFX9-NEXT: s_lshr_b32 s59, s18, 16 +; GFX9-NEXT: s_lshr_b32 s60, s17, 16 +; GFX9-NEXT: s_lshr_b32 s61, s16, 16 +; GFX9-NEXT: s_lshr_b32 s62, s15, 16 +; GFX9-NEXT: s_lshr_b32 s63, s14, 16 +; GFX9-NEXT: s_lshr_b32 s72, s13, 16 +; GFX9-NEXT: s_lshr_b32 s73, s12, 16 +; GFX9-NEXT: s_lshr_b32 s74, s11, 16 +; GFX9-NEXT: s_lshr_b32 s75, s10, 16 +; GFX9-NEXT: s_lshr_b32 s76, s9, 16 +; GFX9-NEXT: s_lshr_b32 s77, s8, 16 +; GFX9-NEXT: s_lshr_b32 s78, s7, 16 +; GFX9-NEXT: s_lshr_b32 s79, s6, 16 ; GFX9-NEXT: .LBB41_3: ; %end -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s79 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s78 -; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s77 -; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s76 -; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s75 -; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s74 -; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s73 -; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s72 -; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s63 -; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s62 -; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s61 -; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s60 -; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s59 -; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s58 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s57 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s56 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s47 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s46 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s45 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s44 -; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s43 -; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s42 -; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s41 -; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s40 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s7, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s9, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s10, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s11, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s12, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s13, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s14, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s15, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s16, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s17, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s40 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s18 -; GFX9-NEXT: v_mov_b32_e32 v5, s19 -; GFX9-NEXT: v_mov_b32_e32 v6, s20 -; GFX9-NEXT: v_mov_b32_e32 v7, s21 -; GFX9-NEXT: v_mov_b32_e32 v8, s22 -; GFX9-NEXT: v_mov_b32_e32 v9, s23 -; GFX9-NEXT: v_mov_b32_e32 v10, s24 -; GFX9-NEXT: v_mov_b32_e32 v11, s25 -; GFX9-NEXT: v_mov_b32_e32 v12, s26 -; GFX9-NEXT: v_mov_b32_e32 v13, s27 -; GFX9-NEXT: v_mov_b32_e32 v14, s6 -; GFX9-NEXT: v_mov_b32_e32 v15, s7 -; GFX9-NEXT: v_mov_b32_e32 v16, s8 -; GFX9-NEXT: v_mov_b32_e32 v17, s9 -; GFX9-NEXT: v_mov_b32_e32 v18, s10 -; GFX9-NEXT: v_mov_b32_e32 v19, s11 -; GFX9-NEXT: v_mov_b32_e32 v20, s12 -; GFX9-NEXT: v_mov_b32_e32 v21, s13 -; GFX9-NEXT: v_mov_b32_e32 v22, s14 -; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-NEXT: v_mov_b32_e32 v12, s16 +; GFX9-NEXT: v_mov_b32_e32 v13, s17 +; GFX9-NEXT: v_mov_b32_e32 v14, s18 +; GFX9-NEXT: v_mov_b32_e32 v15, s19 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v18, s22 +; GFX9-NEXT: v_mov_b32_e32 v19, s23 +; GFX9-NEXT: v_mov_b32_e32 v20, s24 +; GFX9-NEXT: v_mov_b32_e32 v21, s25 +; GFX9-NEXT: v_mov_b32_e32 v22, s26 +; GFX9-NEXT: v_mov_b32_e32 v23, s27 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB41_4: ; GFX9-NEXT: ; implicit-def: $sgpr79 @@ -22796,37 +23164,64 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3 ; GFX11-LABEL: bitcast_v12i64_to_v48i16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v7, s0 :: v_dual_mov_b32 v8, s1 +; GFX11-NEXT: v_dual_mov_b32 v9, s2 :: v_dual_mov_b32 v10, s3 +; GFX11-NEXT: v_dual_mov_b32 v11, s16 :: v_dual_mov_b32 v12, s17 +; GFX11-NEXT: v_dual_mov_b32 v13, s18 :: v_dual_mov_b32 v14, s19 +; GFX11-NEXT: v_dual_mov_b32 v15, s20 :: v_dual_mov_b32 v16, s21 +; GFX11-NEXT: v_dual_mov_b32 v17, s22 :: v_dual_mov_b32 v18, s23 +; GFX11-NEXT: v_dual_mov_b32 v19, s24 :: v_dual_mov_b32 v20, s25 +; GFX11-NEXT: v_dual_mov_b32 v21, s26 :: v_dual_mov_b32 v22, s27 +; GFX11-NEXT: v_dual_mov_b32 v23, s28 :: v_dual_mov_b32 v24, s29 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: v_readfirstlane_b32 s9, v4 -; GFX11-NEXT: v_readfirstlane_b32 s8, v5 +; GFX11-NEXT: v_readfirstlane_b32 s0, v7 +; GFX11-NEXT: v_readfirstlane_b32 s1, v8 +; GFX11-NEXT: v_readfirstlane_b32 s2, v9 +; GFX11-NEXT: v_readfirstlane_b32 s3, v10 +; GFX11-NEXT: v_readfirstlane_b32 s4, v11 +; GFX11-NEXT: v_readfirstlane_b32 s5, v12 +; GFX11-NEXT: v_readfirstlane_b32 s6, v13 +; GFX11-NEXT: v_readfirstlane_b32 s7, v14 +; GFX11-NEXT: v_readfirstlane_b32 s8, v15 +; GFX11-NEXT: v_readfirstlane_b32 s9, v16 +; GFX11-NEXT: v_readfirstlane_b32 s10, v17 +; GFX11-NEXT: v_readfirstlane_b32 s11, v18 +; GFX11-NEXT: v_readfirstlane_b32 s12, v19 +; GFX11-NEXT: v_readfirstlane_b32 s13, v20 +; GFX11-NEXT: v_readfirstlane_b32 s14, v21 +; GFX11-NEXT: v_readfirstlane_b32 s15, v22 +; GFX11-NEXT: v_readfirstlane_b32 s16, v23 +; GFX11-NEXT: v_readfirstlane_b32 s17, v24 +; GFX11-NEXT: v_readfirstlane_b32 s18, v0 +; GFX11-NEXT: v_readfirstlane_b32 s19, v1 +; GFX11-NEXT: v_readfirstlane_b32 s20, v2 +; GFX11-NEXT: v_readfirstlane_b32 s21, v3 +; GFX11-NEXT: v_readfirstlane_b32 s23, v4 +; GFX11-NEXT: v_readfirstlane_b32 s22, v5 ; GFX11-NEXT: s_mov_b32 s74, 0 -; GFX11-NEXT: s_and_b32 s10, vcc_lo, exec_lo +; GFX11-NEXT: s_and_b32 s24, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB41_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: s_lshr_b32 s10, s8, 16 -; GFX11-NEXT: s_lshr_b32 s11, s9, 16 -; GFX11-NEXT: s_lshr_b32 s12, s7, 16 -; GFX11-NEXT: s_lshr_b32 s13, s6, 16 -; GFX11-NEXT: s_lshr_b32 s14, s5, 16 -; GFX11-NEXT: s_lshr_b32 s15, s4, 16 -; GFX11-NEXT: s_lshr_b32 s40, s29, 16 -; GFX11-NEXT: s_lshr_b32 s41, s28, 16 -; GFX11-NEXT: s_lshr_b32 s42, s27, 16 -; GFX11-NEXT: s_lshr_b32 s43, s26, 16 -; GFX11-NEXT: s_lshr_b32 s44, s25, 16 -; GFX11-NEXT: s_lshr_b32 s45, s24, 16 -; GFX11-NEXT: s_lshr_b32 s46, s23, 16 -; GFX11-NEXT: s_lshr_b32 s47, s22, 16 -; GFX11-NEXT: s_lshr_b32 s56, s21, 16 -; GFX11-NEXT: s_lshr_b32 s57, s20, 16 -; GFX11-NEXT: s_lshr_b32 s58, s19, 16 -; GFX11-NEXT: s_lshr_b32 s59, s18, 16 -; GFX11-NEXT: s_lshr_b32 s60, s17, 16 -; GFX11-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-NEXT: s_lshr_b32 s24, s22, 16 +; GFX11-NEXT: s_lshr_b32 s25, s23, 16 +; GFX11-NEXT: s_lshr_b32 s26, s21, 16 +; GFX11-NEXT: s_lshr_b32 s27, s20, 16 +; GFX11-NEXT: s_lshr_b32 s28, s19, 16 +; GFX11-NEXT: s_lshr_b32 s29, s18, 16 +; GFX11-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-NEXT: s_lshr_b32 s42, s15, 16 +; GFX11-NEXT: s_lshr_b32 s43, s14, 16 +; GFX11-NEXT: s_lshr_b32 s44, s13, 16 +; GFX11-NEXT: s_lshr_b32 s45, s12, 16 +; GFX11-NEXT: s_lshr_b32 s46, s11, 16 +; GFX11-NEXT: s_lshr_b32 s47, s10, 16 +; GFX11-NEXT: s_lshr_b32 s56, s9, 16 +; GFX11-NEXT: s_lshr_b32 s57, s8, 16 +; GFX11-NEXT: s_lshr_b32 s58, s7, 16 +; GFX11-NEXT: s_lshr_b32 s59, s6, 16 +; GFX11-NEXT: s_lshr_b32 s60, s5, 16 +; GFX11-NEXT: s_lshr_b32 s61, s4, 16 ; GFX11-NEXT: s_lshr_b32 s62, s3, 16 ; GFX11-NEXT: s_lshr_b32 s63, s2, 16 ; GFX11-NEXT: s_lshr_b32 s72, s1, 16 @@ -22834,50 +23229,50 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s74 ; GFX11-NEXT: s_cbranch_vccnz .LBB41_3 ; GFX11-NEXT: .LBB41_2: ; %cmp.true -; GFX11-NEXT: s_add_u32 s9, s9, 3 -; GFX11-NEXT: s_addc_u32 s8, s8, 0 -; GFX11-NEXT: s_add_u32 s6, s6, 3 -; GFX11-NEXT: s_addc_u32 s7, s7, 0 -; GFX11-NEXT: s_add_u32 s4, s4, 3 -; GFX11-NEXT: s_addc_u32 s5, s5, 0 -; GFX11-NEXT: s_add_u32 s28, s28, 3 -; GFX11-NEXT: s_addc_u32 s29, s29, 0 -; GFX11-NEXT: s_add_u32 s26, s26, 3 -; GFX11-NEXT: s_addc_u32 s27, s27, 0 -; GFX11-NEXT: s_add_u32 s24, s24, 3 -; GFX11-NEXT: s_addc_u32 s25, s25, 0 -; GFX11-NEXT: s_add_u32 s22, s22, 3 -; GFX11-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-NEXT: s_add_u32 s23, s23, 3 +; GFX11-NEXT: s_addc_u32 s22, s22, 0 ; GFX11-NEXT: s_add_u32 s20, s20, 3 ; GFX11-NEXT: s_addc_u32 s21, s21, 0 ; GFX11-NEXT: s_add_u32 s18, s18, 3 ; GFX11-NEXT: s_addc_u32 s19, s19, 0 ; GFX11-NEXT: s_add_u32 s16, s16, 3 ; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s14, s14, 3 +; GFX11-NEXT: s_addc_u32 s15, s15, 0 +; GFX11-NEXT: s_add_u32 s12, s12, 3 +; GFX11-NEXT: s_addc_u32 s13, s13, 0 +; GFX11-NEXT: s_add_u32 s10, s10, 3 +; GFX11-NEXT: s_addc_u32 s11, s11, 0 +; GFX11-NEXT: s_add_u32 s8, s8, 3 +; GFX11-NEXT: s_addc_u32 s9, s9, 0 +; GFX11-NEXT: s_add_u32 s6, s6, 3 +; GFX11-NEXT: s_addc_u32 s7, s7, 0 +; GFX11-NEXT: s_add_u32 s4, s4, 3 +; GFX11-NEXT: s_addc_u32 s5, s5, 0 ; GFX11-NEXT: s_add_u32 s2, s2, 3 ; GFX11-NEXT: s_addc_u32 s3, s3, 0 ; GFX11-NEXT: s_add_u32 s0, s0, 3 ; GFX11-NEXT: s_addc_u32 s1, s1, 0 -; GFX11-NEXT: s_lshr_b32 s10, s8, 16 -; GFX11-NEXT: s_lshr_b32 s11, s9, 16 -; GFX11-NEXT: s_lshr_b32 s12, s7, 16 -; GFX11-NEXT: s_lshr_b32 s13, s6, 16 -; GFX11-NEXT: s_lshr_b32 s14, s5, 16 -; GFX11-NEXT: s_lshr_b32 s15, s4, 16 -; GFX11-NEXT: s_lshr_b32 s40, s29, 16 -; GFX11-NEXT: s_lshr_b32 s41, s28, 16 -; GFX11-NEXT: s_lshr_b32 s42, s27, 16 -; GFX11-NEXT: s_lshr_b32 s43, s26, 16 -; GFX11-NEXT: s_lshr_b32 s44, s25, 16 -; GFX11-NEXT: s_lshr_b32 s45, s24, 16 -; GFX11-NEXT: s_lshr_b32 s46, s23, 16 -; GFX11-NEXT: s_lshr_b32 s47, s22, 16 -; GFX11-NEXT: s_lshr_b32 s56, s21, 16 -; GFX11-NEXT: s_lshr_b32 s57, s20, 16 -; GFX11-NEXT: s_lshr_b32 s58, s19, 16 -; GFX11-NEXT: s_lshr_b32 s59, s18, 16 -; GFX11-NEXT: s_lshr_b32 s60, s17, 16 -; GFX11-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-NEXT: s_lshr_b32 s24, s22, 16 +; GFX11-NEXT: s_lshr_b32 s25, s23, 16 +; GFX11-NEXT: s_lshr_b32 s26, s21, 16 +; GFX11-NEXT: s_lshr_b32 s27, s20, 16 +; GFX11-NEXT: s_lshr_b32 s28, s19, 16 +; GFX11-NEXT: s_lshr_b32 s29, s18, 16 +; GFX11-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-NEXT: s_lshr_b32 s42, s15, 16 +; GFX11-NEXT: s_lshr_b32 s43, s14, 16 +; GFX11-NEXT: s_lshr_b32 s44, s13, 16 +; GFX11-NEXT: s_lshr_b32 s45, s12, 16 +; GFX11-NEXT: s_lshr_b32 s46, s11, 16 +; GFX11-NEXT: s_lshr_b32 s47, s10, 16 +; GFX11-NEXT: s_lshr_b32 s56, s9, 16 +; GFX11-NEXT: s_lshr_b32 s57, s8, 16 +; GFX11-NEXT: s_lshr_b32 s58, s7, 16 +; GFX11-NEXT: s_lshr_b32 s59, s6, 16 +; GFX11-NEXT: s_lshr_b32 s60, s5, 16 +; GFX11-NEXT: s_lshr_b32 s61, s4, 16 ; GFX11-NEXT: s_lshr_b32 s62, s3, 16 ; GFX11-NEXT: s_lshr_b32 s63, s2, 16 ; GFX11-NEXT: s_lshr_b32 s72, s1, 16 @@ -22888,38 +23283,38 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3 ; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s72 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s63 ; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s62 -; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s61 -; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s60 -; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s59 -; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s58 -; GFX11-NEXT: s_pack_ll_b32_b16 s20, s20, s57 -; GFX11-NEXT: s_pack_ll_b32_b16 s21, s21, s56 -; GFX11-NEXT: s_pack_ll_b32_b16 s22, s22, s47 -; GFX11-NEXT: s_pack_ll_b32_b16 s23, s23, s46 -; GFX11-NEXT: s_pack_ll_b32_b16 s24, s24, s45 -; GFX11-NEXT: s_pack_ll_b32_b16 s25, s25, s44 -; GFX11-NEXT: s_pack_ll_b32_b16 s26, s26, s43 -; GFX11-NEXT: s_pack_ll_b32_b16 s27, s27, s42 -; GFX11-NEXT: s_pack_ll_b32_b16 s28, s28, s41 -; GFX11-NEXT: s_pack_ll_b32_b16 s29, s29, s40 -; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s15 -; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s14 -; GFX11-NEXT: s_pack_ll_b32_b16 s6, s6, s13 -; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s12 -; GFX11-NEXT: s_pack_ll_b32_b16 s9, s9, s11 -; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s10 +; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s61 +; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s60 +; GFX11-NEXT: s_pack_ll_b32_b16 s6, s6, s59 +; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s58 +; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s57 +; GFX11-NEXT: s_pack_ll_b32_b16 s9, s9, s56 +; GFX11-NEXT: s_pack_ll_b32_b16 s10, s10, s47 +; GFX11-NEXT: s_pack_ll_b32_b16 s11, s11, s46 +; GFX11-NEXT: s_pack_ll_b32_b16 s12, s12, s45 +; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s44 +; GFX11-NEXT: s_pack_ll_b32_b16 s14, s14, s43 +; GFX11-NEXT: s_pack_ll_b32_b16 s15, s15, s42 +; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s41 +; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s40 +; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s29 +; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s28 +; GFX11-NEXT: s_pack_ll_b32_b16 s20, s20, s27 +; GFX11-NEXT: s_pack_ll_b32_b16 s21, s21, s26 +; GFX11-NEXT: s_pack_ll_b32_b16 s23, s23, s25 +; GFX11-NEXT: s_pack_ll_b32_b16 s22, s22, s24 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 -; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 -; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 -; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 -; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 -; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 -; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 -; GFX11-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 -; GFX11-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 -; GFX11-NEXT: v_dual_mov_b32 v22, s9 :: v_dual_mov_b32 v23, s8 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GFX11-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v21, s21 +; GFX11-NEXT: v_dual_mov_b32 v22, s23 :: v_dual_mov_b32 v23, s22 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB41_4: ; GFX11-NEXT: ; implicit-def: $sgpr73 @@ -22940,12 +23335,12 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr41 ; GFX11-NEXT: ; implicit-def: $sgpr40 -; GFX11-NEXT: ; implicit-def: $sgpr15 -; GFX11-NEXT: ; implicit-def: $sgpr14 -; GFX11-NEXT: ; implicit-def: $sgpr13 -; GFX11-NEXT: ; implicit-def: $sgpr12 -; GFX11-NEXT: ; implicit-def: $sgpr11 -; GFX11-NEXT: ; implicit-def: $sgpr10 +; GFX11-NEXT: ; implicit-def: $sgpr29 +; GFX11-NEXT: ; implicit-def: $sgpr28 +; GFX11-NEXT: ; implicit-def: $sgpr27 +; GFX11-NEXT: ; implicit-def: $sgpr26 +; GFX11-NEXT: ; implicit-def: $sgpr25 +; GFX11-NEXT: ; implicit-def: $sgpr24 ; GFX11-NEXT: s_branch .LBB41_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -23343,7 +23738,7 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 ; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 ; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v23 ; SI-NEXT: .LBB42_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -26139,7 +26534,35 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i ; SI-LABEL: bitcast_v12i64_to_v48f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, s16 +; SI-NEXT: v_mov_b32_e32 v13, s17 +; SI-NEXT: v_mov_b32_e32 v14, s18 +; SI-NEXT: v_mov_b32_e32 v15, s19 +; SI-NEXT: v_mov_b32_e32 v16, s20 +; SI-NEXT: v_mov_b32_e32 v17, s21 +; SI-NEXT: v_mov_b32_e32 v18, s22 +; SI-NEXT: v_mov_b32_e32 v19, s23 +; SI-NEXT: v_readfirstlane_b32 s22, v12 +; SI-NEXT: v_mov_b32_e32 v12, s24 +; SI-NEXT: v_readfirstlane_b32 s40, v13 +; SI-NEXT: v_mov_b32_e32 v13, s25 +; SI-NEXT: v_readfirstlane_b32 s23, v14 +; SI-NEXT: v_mov_b32_e32 v14, s26 +; SI-NEXT: v_readfirstlane_b32 s41, v15 +; SI-NEXT: v_mov_b32_e32 v15, s27 +; SI-NEXT: v_readfirstlane_b32 s24, v16 +; SI-NEXT: v_mov_b32_e32 v16, s28 +; SI-NEXT: v_readfirstlane_b32 s27, v17 +; SI-NEXT: v_mov_b32_e32 v17, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_readfirstlane_b32 s25, v18 +; SI-NEXT: v_readfirstlane_b32 s26, v19 +; SI-NEXT: v_readfirstlane_b32 s20, v12 +; SI-NEXT: v_readfirstlane_b32 s21, v13 +; SI-NEXT: v_readfirstlane_b32 s18, v14 +; SI-NEXT: v_readfirstlane_b32 s19, v15 +; SI-NEXT: v_readfirstlane_b32 s16, v16 +; SI-NEXT: v_readfirstlane_b32 s17, v17 ; SI-NEXT: v_readfirstlane_b32 s14, v1 ; SI-NEXT: v_readfirstlane_b32 s15, v2 ; SI-NEXT: v_readfirstlane_b32 s12, v3 @@ -26174,33 +26597,33 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 ; SI-NEXT: s_lshr_b32 s4, s14, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s4, s17, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: s_lshr_b32 s4, s19, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: s_lshr_b32 s4, s18, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: s_lshr_b32 s4, s21, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: s_lshr_b32 s4, s20, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: s_lshr_b32 s4, s26, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: s_lshr_b32 s4, s25, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: s_lshr_b32 s4, s27, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: s_lshr_b32 s4, s24, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: s_lshr_b32 s4, s41, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: s_lshr_b32 s4, s23, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: s_lshr_b32 s4, s40, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_lshr_b32 s4, s22, 16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 @@ -26213,50 +26636,50 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s22 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s16, 3 -; SI-NEXT: s_addc_u32 s5, s17, 0 -; SI-NEXT: s_lshr_b32 s16, s4, 16 -; SI-NEXT: s_lshr_b32 s17, s5, 16 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: s_lshr_b32 s40, s18, 16 -; SI-NEXT: s_lshr_b32 s41, s19, 16 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_lshr_b32 s42, s20, 16 -; SI-NEXT: s_lshr_b32 s43, s21, 16 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_lshr_b32 s44, s22, 16 -; SI-NEXT: s_lshr_b32 s45, s23, 16 +; SI-NEXT: s_add_u32 s4, s22, 3 +; SI-NEXT: s_addc_u32 s5, s40, 0 +; SI-NEXT: s_lshr_b32 s22, s4, 16 +; SI-NEXT: s_lshr_b32 s28, s5, 16 +; SI-NEXT: s_add_u32 s23, s23, 3 +; SI-NEXT: s_addc_u32 s29, s41, 0 +; SI-NEXT: s_lshr_b32 s40, s23, 16 +; SI-NEXT: s_lshr_b32 s41, s29, 16 ; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s25, s25, 0 -; SI-NEXT: s_lshr_b32 s46, s24, 16 -; SI-NEXT: s_lshr_b32 s47, s25, 16 -; SI-NEXT: s_add_u32 s26, s26, 3 ; SI-NEXT: s_addc_u32 s27, s27, 0 -; SI-NEXT: s_lshr_b32 s56, s26, 16 -; SI-NEXT: s_lshr_b32 s57, s27, 16 -; SI-NEXT: s_add_u32 s28, s28, 3 -; SI-NEXT: s_addc_u32 s29, s29, 0 -; SI-NEXT: s_lshr_b32 s58, s28, 16 -; SI-NEXT: s_lshr_b32 s59, s29, 16 +; SI-NEXT: s_lshr_b32 s42, s24, 16 +; SI-NEXT: s_lshr_b32 s43, s27, 16 +; SI-NEXT: s_add_u32 s25, s25, 3 +; SI-NEXT: s_addc_u32 s26, s26, 0 +; SI-NEXT: s_lshr_b32 s44, s25, 16 +; SI-NEXT: s_lshr_b32 s45, s26, 16 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_lshr_b32 s46, s20, 16 +; SI-NEXT: s_lshr_b32 s47, s21, 16 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_lshr_b32 s56, s18, 16 +; SI-NEXT: s_lshr_b32 s57, s19, 16 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_lshr_b32 s58, s16, 16 +; SI-NEXT: s_lshr_b32 s59, s17, 16 ; SI-NEXT: s_add_u32 s14, s14, 3 ; SI-NEXT: s_addc_u32 s15, s15, 0 ; SI-NEXT: s_lshr_b32 s60, s14, 16 @@ -26287,18 +26710,18 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s23 ; SI-NEXT: v_cvt_f32_f16_e32 v53, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s77 @@ -26323,9 +26746,9 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v39, s42 ; SI-NEXT: v_cvt_f32_f16_e32 v50, s41 ; SI-NEXT: v_cvt_f32_f16_e32 v52, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s28 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v40, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s22 ; SI-NEXT: .LBB45_3: ; %end ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 @@ -26551,7 +26974,35 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i ; VI-LABEL: bitcast_v12i64_to_v48f16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v11, s16 +; VI-NEXT: v_mov_b32_e32 v12, s17 +; VI-NEXT: v_mov_b32_e32 v13, s18 +; VI-NEXT: v_mov_b32_e32 v14, s19 +; VI-NEXT: v_mov_b32_e32 v15, s20 +; VI-NEXT: v_mov_b32_e32 v16, s21 +; VI-NEXT: v_mov_b32_e32 v17, s22 +; VI-NEXT: v_mov_b32_e32 v18, s23 +; VI-NEXT: v_mov_b32_e32 v19, s24 +; VI-NEXT: v_readfirstlane_b32 s42, v11 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_readfirstlane_b32 s41, v12 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_readfirstlane_b32 s40, v13 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_readfirstlane_b32 s26, v14 +; VI-NEXT: v_mov_b32_e32 v14, s28 +; VI-NEXT: v_readfirstlane_b32 s25, v15 +; VI-NEXT: v_mov_b32_e32 v15, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_readfirstlane_b32 s24, v16 +; VI-NEXT: v_readfirstlane_b32 s23, v17 +; VI-NEXT: v_readfirstlane_b32 s22, v18 +; VI-NEXT: v_readfirstlane_b32 s21, v19 +; VI-NEXT: v_readfirstlane_b32 s20, v11 +; VI-NEXT: v_readfirstlane_b32 s19, v12 +; VI-NEXT: v_readfirstlane_b32 s18, v13 +; VI-NEXT: v_readfirstlane_b32 s17, v14 +; VI-NEXT: v_readfirstlane_b32 s16, v15 ; VI-NEXT: v_readfirstlane_b32 s15, v0 ; VI-NEXT: v_readfirstlane_b32 s14, v1 ; VI-NEXT: v_readfirstlane_b32 s13, v2 @@ -26565,9 +27016,9 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i ; VI-NEXT: v_readfirstlane_b32 s7, v9 ; VI-NEXT: s_cbranch_scc0 .LBB45_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s40, s7, 16 -; VI-NEXT: s_lshr_b32 s41, s6, 16 -; VI-NEXT: s_lshr_b32 s42, s8, 16 +; VI-NEXT: s_lshr_b32 s27, s7, 16 +; VI-NEXT: s_lshr_b32 s28, s6, 16 +; VI-NEXT: s_lshr_b32 s29, s8, 16 ; VI-NEXT: s_lshr_b32 s43, s9, 16 ; VI-NEXT: s_lshr_b32 s44, s10, 16 ; VI-NEXT: s_lshr_b32 s45, s11, 16 @@ -26575,20 +27026,20 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i ; VI-NEXT: s_lshr_b32 s47, s13, 16 ; VI-NEXT: s_lshr_b32 s56, s14, 16 ; VI-NEXT: s_lshr_b32 s57, s15, 16 -; VI-NEXT: s_lshr_b32 s58, s29, 16 -; VI-NEXT: s_lshr_b32 s59, s28, 16 -; VI-NEXT: s_lshr_b32 s60, s27, 16 -; VI-NEXT: s_lshr_b32 s61, s26, 16 -; VI-NEXT: s_lshr_b32 s62, s25, 16 -; VI-NEXT: s_lshr_b32 s63, s24, 16 -; VI-NEXT: s_lshr_b32 s72, s23, 16 -; VI-NEXT: s_lshr_b32 s73, s22, 16 -; VI-NEXT: s_lshr_b32 s74, s21, 16 -; VI-NEXT: s_lshr_b32 s75, s20, 16 -; VI-NEXT: s_lshr_b32 s76, s19, 16 -; VI-NEXT: s_lshr_b32 s77, s18, 16 -; VI-NEXT: s_lshr_b32 s78, s17, 16 -; VI-NEXT: s_lshr_b32 s79, s16, 16 +; VI-NEXT: s_lshr_b32 s58, s16, 16 +; VI-NEXT: s_lshr_b32 s59, s17, 16 +; VI-NEXT: s_lshr_b32 s60, s18, 16 +; VI-NEXT: s_lshr_b32 s61, s19, 16 +; VI-NEXT: s_lshr_b32 s62, s20, 16 +; VI-NEXT: s_lshr_b32 s63, s21, 16 +; VI-NEXT: s_lshr_b32 s72, s22, 16 +; VI-NEXT: s_lshr_b32 s73, s23, 16 +; VI-NEXT: s_lshr_b32 s74, s24, 16 +; VI-NEXT: s_lshr_b32 s75, s25, 16 +; VI-NEXT: s_lshr_b32 s76, s26, 16 +; VI-NEXT: s_lshr_b32 s77, s40, 16 +; VI-NEXT: s_lshr_b32 s78, s41, 16 +; VI-NEXT: s_lshr_b32 s79, s42, 16 ; VI-NEXT: s_cbranch_execnz .LBB45_3 ; VI-NEXT: .LBB45_2: ; %cmp.true ; VI-NEXT: s_add_u32 s6, s6, 3 @@ -26601,23 +27052,23 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i ; VI-NEXT: s_addc_u32 s12, s12, 0 ; VI-NEXT: s_add_u32 s15, s15, 3 ; VI-NEXT: s_addc_u32 s14, s14, 0 -; VI-NEXT: s_add_u32 s28, s28, 3 -; VI-NEXT: s_addc_u32 s29, s29, 0 -; VI-NEXT: s_add_u32 s26, s26, 3 -; VI-NEXT: s_addc_u32 s27, s27, 0 -; VI-NEXT: s_add_u32 s24, s24, 3 -; VI-NEXT: s_addc_u32 s25, s25, 0 -; VI-NEXT: s_add_u32 s22, s22, 3 -; VI-NEXT: s_addc_u32 s23, s23, 0 -; VI-NEXT: s_add_u32 s20, s20, 3 -; VI-NEXT: s_addc_u32 s21, s21, 0 -; VI-NEXT: s_add_u32 s18, s18, 3 -; VI-NEXT: s_addc_u32 s19, s19, 0 -; VI-NEXT: s_add_u32 s16, s16, 3 -; VI-NEXT: s_addc_u32 s17, s17, 0 -; VI-NEXT: s_lshr_b32 s40, s7, 16 -; VI-NEXT: s_lshr_b32 s41, s6, 16 -; VI-NEXT: s_lshr_b32 s42, s8, 16 +; VI-NEXT: s_add_u32 s17, s17, 3 +; VI-NEXT: s_addc_u32 s16, s16, 0 +; VI-NEXT: s_add_u32 s19, s19, 3 +; VI-NEXT: s_addc_u32 s18, s18, 0 +; VI-NEXT: s_add_u32 s21, s21, 3 +; VI-NEXT: s_addc_u32 s20, s20, 0 +; VI-NEXT: s_add_u32 s23, s23, 3 +; VI-NEXT: s_addc_u32 s22, s22, 0 +; VI-NEXT: s_add_u32 s25, s25, 3 +; VI-NEXT: s_addc_u32 s24, s24, 0 +; VI-NEXT: s_add_u32 s40, s40, 3 +; VI-NEXT: s_addc_u32 s26, s26, 0 +; VI-NEXT: s_add_u32 s42, s42, 3 +; VI-NEXT: s_addc_u32 s41, s41, 0 +; VI-NEXT: s_lshr_b32 s27, s7, 16 +; VI-NEXT: s_lshr_b32 s28, s6, 16 +; VI-NEXT: s_lshr_b32 s29, s8, 16 ; VI-NEXT: s_lshr_b32 s43, s9, 16 ; VI-NEXT: s_lshr_b32 s44, s10, 16 ; VI-NEXT: s_lshr_b32 s45, s11, 16 @@ -26625,107 +27076,107 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i ; VI-NEXT: s_lshr_b32 s47, s13, 16 ; VI-NEXT: s_lshr_b32 s56, s14, 16 ; VI-NEXT: s_lshr_b32 s57, s15, 16 -; VI-NEXT: s_lshr_b32 s58, s29, 16 -; VI-NEXT: s_lshr_b32 s59, s28, 16 -; VI-NEXT: s_lshr_b32 s60, s27, 16 -; VI-NEXT: s_lshr_b32 s61, s26, 16 -; VI-NEXT: s_lshr_b32 s62, s25, 16 -; VI-NEXT: s_lshr_b32 s63, s24, 16 -; VI-NEXT: s_lshr_b32 s72, s23, 16 -; VI-NEXT: s_lshr_b32 s73, s22, 16 -; VI-NEXT: s_lshr_b32 s74, s21, 16 -; VI-NEXT: s_lshr_b32 s75, s20, 16 -; VI-NEXT: s_lshr_b32 s76, s19, 16 -; VI-NEXT: s_lshr_b32 s77, s18, 16 -; VI-NEXT: s_lshr_b32 s78, s17, 16 -; VI-NEXT: s_lshr_b32 s79, s16, 16 +; VI-NEXT: s_lshr_b32 s58, s16, 16 +; VI-NEXT: s_lshr_b32 s59, s17, 16 +; VI-NEXT: s_lshr_b32 s60, s18, 16 +; VI-NEXT: s_lshr_b32 s61, s19, 16 +; VI-NEXT: s_lshr_b32 s62, s20, 16 +; VI-NEXT: s_lshr_b32 s63, s21, 16 +; VI-NEXT: s_lshr_b32 s72, s22, 16 +; VI-NEXT: s_lshr_b32 s73, s23, 16 +; VI-NEXT: s_lshr_b32 s74, s24, 16 +; VI-NEXT: s_lshr_b32 s75, s25, 16 +; VI-NEXT: s_lshr_b32 s76, s26, 16 +; VI-NEXT: s_lshr_b32 s77, s40, 16 +; VI-NEXT: s_lshr_b32 s78, s41, 16 +; VI-NEXT: s_lshr_b32 s79, s42, 16 ; VI-NEXT: .LBB45_3: ; %end -; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_and_b32 s4, 0xffff, s42 ; VI-NEXT: s_lshl_b32 s5, s79, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s16, s78, 16 -; VI-NEXT: s_or_b32 s5, s5, s16 -; VI-NEXT: s_and_b32 s16, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s17, s77, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_and_b32 s17, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s18, s76, 16 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s18, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s19, s75, 16 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_and_b32 s19, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s20, s74, 16 -; VI-NEXT: s_or_b32 s19, s19, s20 -; VI-NEXT: s_and_b32 s20, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s21, s73, 16 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_and_b32 s21, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s22, s72, 16 -; VI-NEXT: s_or_b32 s21, s21, s22 -; VI-NEXT: s_and_b32 s22, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s23, s63, 16 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_and_b32 s23, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s24, s62, 16 -; VI-NEXT: s_or_b32 s23, s23, s24 -; VI-NEXT: s_and_b32 s24, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s25, s61, 16 -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_and_b32 s25, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s26, s60, 16 -; VI-NEXT: s_or_b32 s25, s25, s26 -; VI-NEXT: s_and_b32 s26, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s27, s59, 16 -; VI-NEXT: s_or_b32 s26, s26, s27 -; VI-NEXT: s_and_b32 s27, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s28, s58, 16 -; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s5, 0xffff, s41 +; VI-NEXT: s_lshl_b32 s41, s78, 16 +; VI-NEXT: s_or_b32 s5, s5, s41 +; VI-NEXT: s_and_b32 s40, 0xffff, s40 +; VI-NEXT: s_lshl_b32 s41, s77, 16 +; VI-NEXT: s_or_b32 s40, s40, s41 +; VI-NEXT: s_and_b32 s26, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s41, s76, 16 +; VI-NEXT: s_or_b32 s26, s26, s41 +; VI-NEXT: s_and_b32 s25, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s41, s75, 16 +; VI-NEXT: s_or_b32 s25, s25, s41 +; VI-NEXT: s_and_b32 s24, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s41, s74, 16 +; VI-NEXT: s_or_b32 s24, s24, s41 +; VI-NEXT: s_and_b32 s23, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s41, s73, 16 +; VI-NEXT: s_or_b32 s23, s23, s41 +; VI-NEXT: s_and_b32 s22, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s41, s72, 16 +; VI-NEXT: s_or_b32 s22, s22, s41 +; VI-NEXT: s_and_b32 s21, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s41, s63, 16 +; VI-NEXT: s_or_b32 s21, s21, s41 +; VI-NEXT: s_and_b32 s20, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s41, s62, 16 +; VI-NEXT: s_or_b32 s20, s20, s41 +; VI-NEXT: s_and_b32 s19, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s41, s61, 16 +; VI-NEXT: s_or_b32 s19, s19, s41 +; VI-NEXT: s_and_b32 s18, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s41, s60, 16 +; VI-NEXT: s_or_b32 s18, s18, s41 +; VI-NEXT: s_and_b32 s17, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s41, s59, 16 +; VI-NEXT: s_or_b32 s17, s17, s41 +; VI-NEXT: s_and_b32 s16, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s41, s58, 16 +; VI-NEXT: s_or_b32 s16, s16, s41 ; VI-NEXT: s_and_b32 s15, 0xffff, s15 -; VI-NEXT: s_lshl_b32 s28, s57, 16 -; VI-NEXT: s_or_b32 s15, s15, s28 +; VI-NEXT: s_lshl_b32 s41, s57, 16 +; VI-NEXT: s_or_b32 s15, s15, s41 ; VI-NEXT: s_and_b32 s14, 0xffff, s14 -; VI-NEXT: s_lshl_b32 s28, s56, 16 -; VI-NEXT: s_or_b32 s14, s14, s28 +; VI-NEXT: s_lshl_b32 s41, s56, 16 +; VI-NEXT: s_or_b32 s14, s14, s41 ; VI-NEXT: s_and_b32 s13, 0xffff, s13 -; VI-NEXT: s_lshl_b32 s28, s47, 16 -; VI-NEXT: s_or_b32 s13, s13, s28 +; VI-NEXT: s_lshl_b32 s41, s47, 16 +; VI-NEXT: s_or_b32 s13, s13, s41 ; VI-NEXT: s_and_b32 s12, 0xffff, s12 -; VI-NEXT: s_lshl_b32 s28, s46, 16 -; VI-NEXT: s_or_b32 s12, s12, s28 +; VI-NEXT: s_lshl_b32 s41, s46, 16 +; VI-NEXT: s_or_b32 s12, s12, s41 ; VI-NEXT: s_and_b32 s11, 0xffff, s11 -; VI-NEXT: s_lshl_b32 s28, s45, 16 -; VI-NEXT: s_or_b32 s11, s11, s28 +; VI-NEXT: s_lshl_b32 s41, s45, 16 +; VI-NEXT: s_or_b32 s11, s11, s41 ; VI-NEXT: s_and_b32 s10, 0xffff, s10 -; VI-NEXT: s_lshl_b32 s28, s44, 16 -; VI-NEXT: s_or_b32 s10, s10, s28 +; VI-NEXT: s_lshl_b32 s41, s44, 16 +; VI-NEXT: s_or_b32 s10, s10, s41 ; VI-NEXT: s_and_b32 s9, 0xffff, s9 -; VI-NEXT: s_lshl_b32 s28, s43, 16 -; VI-NEXT: s_or_b32 s9, s9, s28 +; VI-NEXT: s_lshl_b32 s41, s43, 16 ; VI-NEXT: s_and_b32 s8, 0xffff, s8 -; VI-NEXT: s_lshl_b32 s28, s42, 16 -; VI-NEXT: s_or_b32 s8, s8, s28 +; VI-NEXT: s_lshl_b32 s29, s29, 16 ; VI-NEXT: s_and_b32 s6, 0xffff, s6 -; VI-NEXT: s_lshl_b32 s28, s41, 16 -; VI-NEXT: s_or_b32 s6, s6, s28 +; VI-NEXT: s_lshl_b32 s28, s28, 16 ; VI-NEXT: s_and_b32 s7, 0xffff, s7 -; VI-NEXT: s_lshl_b32 s28, s40, 16 -; VI-NEXT: s_or_b32 s7, s7, s28 +; VI-NEXT: s_lshl_b32 s27, s27, 16 +; VI-NEXT: s_or_b32 s9, s9, s41 +; VI-NEXT: s_or_b32 s8, s8, s29 +; VI-NEXT: s_or_b32 s6, s6, s28 +; VI-NEXT: s_or_b32 s7, s7, s27 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: v_mov_b32_e32 v3, s17 -; VI-NEXT: v_mov_b32_e32 v4, s18 -; VI-NEXT: v_mov_b32_e32 v5, s19 -; VI-NEXT: v_mov_b32_e32 v6, s20 -; VI-NEXT: v_mov_b32_e32 v7, s21 -; VI-NEXT: v_mov_b32_e32 v8, s22 -; VI-NEXT: v_mov_b32_e32 v9, s23 -; VI-NEXT: v_mov_b32_e32 v10, s24 -; VI-NEXT: v_mov_b32_e32 v11, s25 -; VI-NEXT: v_mov_b32_e32 v12, s26 -; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v2, s40 +; VI-NEXT: v_mov_b32_e32 v3, s26 +; VI-NEXT: v_mov_b32_e32 v4, s25 +; VI-NEXT: v_mov_b32_e32 v5, s24 +; VI-NEXT: v_mov_b32_e32 v6, s23 +; VI-NEXT: v_mov_b32_e32 v7, s22 +; VI-NEXT: v_mov_b32_e32 v8, s21 +; VI-NEXT: v_mov_b32_e32 v9, s20 +; VI-NEXT: v_mov_b32_e32 v10, s19 +; VI-NEXT: v_mov_b32_e32 v11, s18 +; VI-NEXT: v_mov_b32_e32 v12, s17 +; VI-NEXT: v_mov_b32_e32 v13, s16 ; VI-NEXT: v_mov_b32_e32 v14, s15 ; VI-NEXT: v_mov_b32_e32 v15, s14 ; VI-NEXT: v_mov_b32_e32 v16, s13 @@ -26759,64 +27210,82 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i ; VI-NEXT: ; implicit-def: $sgpr45 ; VI-NEXT: ; implicit-def: $sgpr44 ; VI-NEXT: ; implicit-def: $sgpr43 -; VI-NEXT: ; implicit-def: $sgpr42 -; VI-NEXT: ; implicit-def: $sgpr41 -; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 ; VI-NEXT: s_branch .LBB45_2 ; ; GFX9-LABEL: bitcast_v12i64_to_v48f16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v11, s16 +; GFX9-NEXT: v_mov_b32_e32 v12, s17 +; GFX9-NEXT: v_mov_b32_e32 v13, s18 +; GFX9-NEXT: v_mov_b32_e32 v14, s19 +; GFX9-NEXT: v_mov_b32_e32 v15, s20 +; GFX9-NEXT: v_mov_b32_e32 v16, s21 +; GFX9-NEXT: v_mov_b32_e32 v17, s22 +; GFX9-NEXT: v_mov_b32_e32 v18, s23 +; GFX9-NEXT: v_mov_b32_e32 v19, s24 +; GFX9-NEXT: v_readfirstlane_b32 s6, v11 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_readfirstlane_b32 s7, v12 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_readfirstlane_b32 s8, v13 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_readfirstlane_b32 s9, v14 +; GFX9-NEXT: v_mov_b32_e32 v14, s28 +; GFX9-NEXT: v_readfirstlane_b32 s10, v15 +; GFX9-NEXT: v_mov_b32_e32 v15, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: v_readfirstlane_b32 s7, v1 -; GFX9-NEXT: v_readfirstlane_b32 s8, v2 -; GFX9-NEXT: v_readfirstlane_b32 s9, v3 -; GFX9-NEXT: v_readfirstlane_b32 s10, v4 -; GFX9-NEXT: v_readfirstlane_b32 s11, v5 -; GFX9-NEXT: v_readfirstlane_b32 s12, v6 -; GFX9-NEXT: v_readfirstlane_b32 s13, v7 -; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: v_readfirstlane_b32 s11, v16 +; GFX9-NEXT: v_readfirstlane_b32 s12, v17 +; GFX9-NEXT: v_readfirstlane_b32 s13, v18 +; GFX9-NEXT: v_readfirstlane_b32 s14, v19 +; GFX9-NEXT: v_readfirstlane_b32 s15, v11 +; GFX9-NEXT: v_readfirstlane_b32 s16, v12 +; GFX9-NEXT: v_readfirstlane_b32 s17, v13 +; GFX9-NEXT: v_readfirstlane_b32 s18, v14 +; GFX9-NEXT: v_readfirstlane_b32 s19, v15 +; GFX9-NEXT: v_readfirstlane_b32 s20, v0 +; GFX9-NEXT: v_readfirstlane_b32 s21, v1 +; GFX9-NEXT: v_readfirstlane_b32 s22, v2 +; GFX9-NEXT: v_readfirstlane_b32 s23, v3 +; GFX9-NEXT: v_readfirstlane_b32 s24, v4 +; GFX9-NEXT: v_readfirstlane_b32 s25, v5 +; GFX9-NEXT: v_readfirstlane_b32 s26, v6 +; GFX9-NEXT: v_readfirstlane_b32 s27, v7 +; GFX9-NEXT: v_readfirstlane_b32 s28, v8 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s29, v9 ; GFX9-NEXT: s_cbranch_scc0 .LBB45_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_lshr_b32 s40, s15, 16 -; GFX9-NEXT: s_lshr_b32 s41, s14, 16 -; GFX9-NEXT: s_lshr_b32 s42, s13, 16 -; GFX9-NEXT: s_lshr_b32 s43, s12, 16 -; GFX9-NEXT: s_lshr_b32 s44, s11, 16 -; GFX9-NEXT: s_lshr_b32 s45, s10, 16 -; GFX9-NEXT: s_lshr_b32 s46, s9, 16 -; GFX9-NEXT: s_lshr_b32 s47, s8, 16 -; GFX9-NEXT: s_lshr_b32 s56, s7, 16 -; GFX9-NEXT: s_lshr_b32 s57, s6, 16 -; GFX9-NEXT: s_lshr_b32 s58, s29, 16 -; GFX9-NEXT: s_lshr_b32 s59, s28, 16 -; GFX9-NEXT: s_lshr_b32 s60, s27, 16 -; GFX9-NEXT: s_lshr_b32 s61, s26, 16 -; GFX9-NEXT: s_lshr_b32 s62, s25, 16 -; GFX9-NEXT: s_lshr_b32 s63, s24, 16 -; GFX9-NEXT: s_lshr_b32 s72, s23, 16 -; GFX9-NEXT: s_lshr_b32 s73, s22, 16 -; GFX9-NEXT: s_lshr_b32 s74, s21, 16 -; GFX9-NEXT: s_lshr_b32 s75, s20, 16 -; GFX9-NEXT: s_lshr_b32 s76, s19, 16 -; GFX9-NEXT: s_lshr_b32 s77, s18, 16 -; GFX9-NEXT: s_lshr_b32 s78, s17, 16 -; GFX9-NEXT: s_lshr_b32 s79, s16, 16 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s44, s25, 16 +; GFX9-NEXT: s_lshr_b32 s45, s24, 16 +; GFX9-NEXT: s_lshr_b32 s46, s23, 16 +; GFX9-NEXT: s_lshr_b32 s47, s22, 16 +; GFX9-NEXT: s_lshr_b32 s56, s21, 16 +; GFX9-NEXT: s_lshr_b32 s57, s20, 16 +; GFX9-NEXT: s_lshr_b32 s58, s19, 16 +; GFX9-NEXT: s_lshr_b32 s59, s18, 16 +; GFX9-NEXT: s_lshr_b32 s60, s17, 16 +; GFX9-NEXT: s_lshr_b32 s61, s16, 16 +; GFX9-NEXT: s_lshr_b32 s62, s15, 16 +; GFX9-NEXT: s_lshr_b32 s63, s14, 16 +; GFX9-NEXT: s_lshr_b32 s72, s13, 16 +; GFX9-NEXT: s_lshr_b32 s73, s12, 16 +; GFX9-NEXT: s_lshr_b32 s74, s11, 16 +; GFX9-NEXT: s_lshr_b32 s75, s10, 16 +; GFX9-NEXT: s_lshr_b32 s76, s9, 16 +; GFX9-NEXT: s_lshr_b32 s77, s8, 16 +; GFX9-NEXT: s_lshr_b32 s78, s7, 16 +; GFX9-NEXT: s_lshr_b32 s79, s6, 16 ; GFX9-NEXT: s_cbranch_execnz .LBB45_3 ; GFX9-NEXT: .LBB45_2: ; %cmp.true -; GFX9-NEXT: s_add_u32 s14, s14, 3 -; GFX9-NEXT: s_addc_u32 s15, s15, 0 -; GFX9-NEXT: s_add_u32 s12, s12, 3 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-NEXT: s_add_u32 s10, s10, 3 -; GFX9-NEXT: s_addc_u32 s11, s11, 0 -; GFX9-NEXT: s_add_u32 s8, s8, 3 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-NEXT: s_add_u32 s6, s6, 3 -; GFX9-NEXT: s_addc_u32 s7, s7, 0 ; GFX9-NEXT: s_add_u32 s28, s28, 3 ; GFX9-NEXT: s_addc_u32 s29, s29, 0 ; GFX9-NEXT: s_add_u32 s26, s26, 3 @@ -26831,79 +27300,89 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i ; GFX9-NEXT: s_addc_u32 s19, s19, 0 ; GFX9-NEXT: s_add_u32 s16, s16, 3 ; GFX9-NEXT: s_addc_u32 s17, s17, 0 -; GFX9-NEXT: s_lshr_b32 s40, s15, 16 -; GFX9-NEXT: s_lshr_b32 s41, s14, 16 -; GFX9-NEXT: s_lshr_b32 s42, s13, 16 -; GFX9-NEXT: s_lshr_b32 s43, s12, 16 -; GFX9-NEXT: s_lshr_b32 s44, s11, 16 -; GFX9-NEXT: s_lshr_b32 s45, s10, 16 -; GFX9-NEXT: s_lshr_b32 s46, s9, 16 -; GFX9-NEXT: s_lshr_b32 s47, s8, 16 -; GFX9-NEXT: s_lshr_b32 s56, s7, 16 -; GFX9-NEXT: s_lshr_b32 s57, s6, 16 -; GFX9-NEXT: s_lshr_b32 s58, s29, 16 -; GFX9-NEXT: s_lshr_b32 s59, s28, 16 -; GFX9-NEXT: s_lshr_b32 s60, s27, 16 -; GFX9-NEXT: s_lshr_b32 s61, s26, 16 -; GFX9-NEXT: s_lshr_b32 s62, s25, 16 -; GFX9-NEXT: s_lshr_b32 s63, s24, 16 -; GFX9-NEXT: s_lshr_b32 s72, s23, 16 -; GFX9-NEXT: s_lshr_b32 s73, s22, 16 -; GFX9-NEXT: s_lshr_b32 s74, s21, 16 -; GFX9-NEXT: s_lshr_b32 s75, s20, 16 -; GFX9-NEXT: s_lshr_b32 s76, s19, 16 -; GFX9-NEXT: s_lshr_b32 s77, s18, 16 -; GFX9-NEXT: s_lshr_b32 s78, s17, 16 -; GFX9-NEXT: s_lshr_b32 s79, s16, 16 +; GFX9-NEXT: s_add_u32 s14, s14, 3 +; GFX9-NEXT: s_addc_u32 s15, s15, 0 +; GFX9-NEXT: s_add_u32 s12, s12, 3 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_add_u32 s10, s10, 3 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s44, s25, 16 +; GFX9-NEXT: s_lshr_b32 s45, s24, 16 +; GFX9-NEXT: s_lshr_b32 s46, s23, 16 +; GFX9-NEXT: s_lshr_b32 s47, s22, 16 +; GFX9-NEXT: s_lshr_b32 s56, s21, 16 +; GFX9-NEXT: s_lshr_b32 s57, s20, 16 +; GFX9-NEXT: s_lshr_b32 s58, s19, 16 +; GFX9-NEXT: s_lshr_b32 s59, s18, 16 +; GFX9-NEXT: s_lshr_b32 s60, s17, 16 +; GFX9-NEXT: s_lshr_b32 s61, s16, 16 +; GFX9-NEXT: s_lshr_b32 s62, s15, 16 +; GFX9-NEXT: s_lshr_b32 s63, s14, 16 +; GFX9-NEXT: s_lshr_b32 s72, s13, 16 +; GFX9-NEXT: s_lshr_b32 s73, s12, 16 +; GFX9-NEXT: s_lshr_b32 s74, s11, 16 +; GFX9-NEXT: s_lshr_b32 s75, s10, 16 +; GFX9-NEXT: s_lshr_b32 s76, s9, 16 +; GFX9-NEXT: s_lshr_b32 s77, s8, 16 +; GFX9-NEXT: s_lshr_b32 s78, s7, 16 +; GFX9-NEXT: s_lshr_b32 s79, s6, 16 ; GFX9-NEXT: .LBB45_3: ; %end -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s79 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s78 -; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s77 -; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s76 -; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s75 -; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s74 -; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s73 -; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s72 -; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s63 -; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s62 -; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s61 -; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s60 -; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s59 -; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s58 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s57 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s56 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s47 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s46 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s45 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s44 -; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s43 -; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s42 -; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s41 -; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s40 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s7, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s9, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s10, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s11, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s12, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s13, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s14, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s15, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s16, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s17, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s40 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s18 -; GFX9-NEXT: v_mov_b32_e32 v5, s19 -; GFX9-NEXT: v_mov_b32_e32 v6, s20 -; GFX9-NEXT: v_mov_b32_e32 v7, s21 -; GFX9-NEXT: v_mov_b32_e32 v8, s22 -; GFX9-NEXT: v_mov_b32_e32 v9, s23 -; GFX9-NEXT: v_mov_b32_e32 v10, s24 -; GFX9-NEXT: v_mov_b32_e32 v11, s25 -; GFX9-NEXT: v_mov_b32_e32 v12, s26 -; GFX9-NEXT: v_mov_b32_e32 v13, s27 -; GFX9-NEXT: v_mov_b32_e32 v14, s6 -; GFX9-NEXT: v_mov_b32_e32 v15, s7 -; GFX9-NEXT: v_mov_b32_e32 v16, s8 -; GFX9-NEXT: v_mov_b32_e32 v17, s9 -; GFX9-NEXT: v_mov_b32_e32 v18, s10 -; GFX9-NEXT: v_mov_b32_e32 v19, s11 -; GFX9-NEXT: v_mov_b32_e32 v20, s12 -; GFX9-NEXT: v_mov_b32_e32 v21, s13 -; GFX9-NEXT: v_mov_b32_e32 v22, s14 -; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-NEXT: v_mov_b32_e32 v12, s16 +; GFX9-NEXT: v_mov_b32_e32 v13, s17 +; GFX9-NEXT: v_mov_b32_e32 v14, s18 +; GFX9-NEXT: v_mov_b32_e32 v15, s19 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v18, s22 +; GFX9-NEXT: v_mov_b32_e32 v19, s23 +; GFX9-NEXT: v_mov_b32_e32 v20, s24 +; GFX9-NEXT: v_mov_b32_e32 v21, s25 +; GFX9-NEXT: v_mov_b32_e32 v22, s26 +; GFX9-NEXT: v_mov_b32_e32 v23, s27 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB45_4: ; GFX9-NEXT: ; implicit-def: $sgpr79 @@ -26935,37 +27414,64 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i ; GFX11-LABEL: bitcast_v12i64_to_v48f16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v7, s0 :: v_dual_mov_b32 v8, s1 +; GFX11-NEXT: v_dual_mov_b32 v9, s2 :: v_dual_mov_b32 v10, s3 +; GFX11-NEXT: v_dual_mov_b32 v11, s16 :: v_dual_mov_b32 v12, s17 +; GFX11-NEXT: v_dual_mov_b32 v13, s18 :: v_dual_mov_b32 v14, s19 +; GFX11-NEXT: v_dual_mov_b32 v15, s20 :: v_dual_mov_b32 v16, s21 +; GFX11-NEXT: v_dual_mov_b32 v17, s22 :: v_dual_mov_b32 v18, s23 +; GFX11-NEXT: v_dual_mov_b32 v19, s24 :: v_dual_mov_b32 v20, s25 +; GFX11-NEXT: v_dual_mov_b32 v21, s26 :: v_dual_mov_b32 v22, s27 +; GFX11-NEXT: v_dual_mov_b32 v23, s28 :: v_dual_mov_b32 v24, s29 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: v_readfirstlane_b32 s9, v4 -; GFX11-NEXT: v_readfirstlane_b32 s8, v5 +; GFX11-NEXT: v_readfirstlane_b32 s0, v7 +; GFX11-NEXT: v_readfirstlane_b32 s1, v8 +; GFX11-NEXT: v_readfirstlane_b32 s2, v9 +; GFX11-NEXT: v_readfirstlane_b32 s3, v10 +; GFX11-NEXT: v_readfirstlane_b32 s4, v11 +; GFX11-NEXT: v_readfirstlane_b32 s5, v12 +; GFX11-NEXT: v_readfirstlane_b32 s6, v13 +; GFX11-NEXT: v_readfirstlane_b32 s7, v14 +; GFX11-NEXT: v_readfirstlane_b32 s8, v15 +; GFX11-NEXT: v_readfirstlane_b32 s9, v16 +; GFX11-NEXT: v_readfirstlane_b32 s10, v17 +; GFX11-NEXT: v_readfirstlane_b32 s11, v18 +; GFX11-NEXT: v_readfirstlane_b32 s12, v19 +; GFX11-NEXT: v_readfirstlane_b32 s13, v20 +; GFX11-NEXT: v_readfirstlane_b32 s14, v21 +; GFX11-NEXT: v_readfirstlane_b32 s15, v22 +; GFX11-NEXT: v_readfirstlane_b32 s16, v23 +; GFX11-NEXT: v_readfirstlane_b32 s17, v24 +; GFX11-NEXT: v_readfirstlane_b32 s18, v0 +; GFX11-NEXT: v_readfirstlane_b32 s19, v1 +; GFX11-NEXT: v_readfirstlane_b32 s20, v2 +; GFX11-NEXT: v_readfirstlane_b32 s21, v3 +; GFX11-NEXT: v_readfirstlane_b32 s23, v4 +; GFX11-NEXT: v_readfirstlane_b32 s22, v5 ; GFX11-NEXT: s_mov_b32 s74, 0 -; GFX11-NEXT: s_and_b32 s10, vcc_lo, exec_lo +; GFX11-NEXT: s_and_b32 s24, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB45_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: s_lshr_b32 s10, s8, 16 -; GFX11-NEXT: s_lshr_b32 s11, s9, 16 -; GFX11-NEXT: s_lshr_b32 s12, s7, 16 -; GFX11-NEXT: s_lshr_b32 s13, s6, 16 -; GFX11-NEXT: s_lshr_b32 s14, s5, 16 -; GFX11-NEXT: s_lshr_b32 s15, s4, 16 -; GFX11-NEXT: s_lshr_b32 s40, s29, 16 -; GFX11-NEXT: s_lshr_b32 s41, s28, 16 -; GFX11-NEXT: s_lshr_b32 s42, s27, 16 -; GFX11-NEXT: s_lshr_b32 s43, s26, 16 -; GFX11-NEXT: s_lshr_b32 s44, s25, 16 -; GFX11-NEXT: s_lshr_b32 s45, s24, 16 -; GFX11-NEXT: s_lshr_b32 s46, s23, 16 -; GFX11-NEXT: s_lshr_b32 s47, s22, 16 -; GFX11-NEXT: s_lshr_b32 s56, s21, 16 -; GFX11-NEXT: s_lshr_b32 s57, s20, 16 -; GFX11-NEXT: s_lshr_b32 s58, s19, 16 -; GFX11-NEXT: s_lshr_b32 s59, s18, 16 -; GFX11-NEXT: s_lshr_b32 s60, s17, 16 -; GFX11-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-NEXT: s_lshr_b32 s24, s22, 16 +; GFX11-NEXT: s_lshr_b32 s25, s23, 16 +; GFX11-NEXT: s_lshr_b32 s26, s21, 16 +; GFX11-NEXT: s_lshr_b32 s27, s20, 16 +; GFX11-NEXT: s_lshr_b32 s28, s19, 16 +; GFX11-NEXT: s_lshr_b32 s29, s18, 16 +; GFX11-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-NEXT: s_lshr_b32 s42, s15, 16 +; GFX11-NEXT: s_lshr_b32 s43, s14, 16 +; GFX11-NEXT: s_lshr_b32 s44, s13, 16 +; GFX11-NEXT: s_lshr_b32 s45, s12, 16 +; GFX11-NEXT: s_lshr_b32 s46, s11, 16 +; GFX11-NEXT: s_lshr_b32 s47, s10, 16 +; GFX11-NEXT: s_lshr_b32 s56, s9, 16 +; GFX11-NEXT: s_lshr_b32 s57, s8, 16 +; GFX11-NEXT: s_lshr_b32 s58, s7, 16 +; GFX11-NEXT: s_lshr_b32 s59, s6, 16 +; GFX11-NEXT: s_lshr_b32 s60, s5, 16 +; GFX11-NEXT: s_lshr_b32 s61, s4, 16 ; GFX11-NEXT: s_lshr_b32 s62, s3, 16 ; GFX11-NEXT: s_lshr_b32 s63, s2, 16 ; GFX11-NEXT: s_lshr_b32 s72, s1, 16 @@ -26973,50 +27479,50 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s74 ; GFX11-NEXT: s_cbranch_vccnz .LBB45_3 ; GFX11-NEXT: .LBB45_2: ; %cmp.true -; GFX11-NEXT: s_add_u32 s9, s9, 3 -; GFX11-NEXT: s_addc_u32 s8, s8, 0 -; GFX11-NEXT: s_add_u32 s6, s6, 3 -; GFX11-NEXT: s_addc_u32 s7, s7, 0 -; GFX11-NEXT: s_add_u32 s4, s4, 3 -; GFX11-NEXT: s_addc_u32 s5, s5, 0 -; GFX11-NEXT: s_add_u32 s28, s28, 3 -; GFX11-NEXT: s_addc_u32 s29, s29, 0 -; GFX11-NEXT: s_add_u32 s26, s26, 3 -; GFX11-NEXT: s_addc_u32 s27, s27, 0 -; GFX11-NEXT: s_add_u32 s24, s24, 3 -; GFX11-NEXT: s_addc_u32 s25, s25, 0 -; GFX11-NEXT: s_add_u32 s22, s22, 3 -; GFX11-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-NEXT: s_add_u32 s23, s23, 3 +; GFX11-NEXT: s_addc_u32 s22, s22, 0 ; GFX11-NEXT: s_add_u32 s20, s20, 3 ; GFX11-NEXT: s_addc_u32 s21, s21, 0 ; GFX11-NEXT: s_add_u32 s18, s18, 3 ; GFX11-NEXT: s_addc_u32 s19, s19, 0 ; GFX11-NEXT: s_add_u32 s16, s16, 3 ; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s14, s14, 3 +; GFX11-NEXT: s_addc_u32 s15, s15, 0 +; GFX11-NEXT: s_add_u32 s12, s12, 3 +; GFX11-NEXT: s_addc_u32 s13, s13, 0 +; GFX11-NEXT: s_add_u32 s10, s10, 3 +; GFX11-NEXT: s_addc_u32 s11, s11, 0 +; GFX11-NEXT: s_add_u32 s8, s8, 3 +; GFX11-NEXT: s_addc_u32 s9, s9, 0 +; GFX11-NEXT: s_add_u32 s6, s6, 3 +; GFX11-NEXT: s_addc_u32 s7, s7, 0 +; GFX11-NEXT: s_add_u32 s4, s4, 3 +; GFX11-NEXT: s_addc_u32 s5, s5, 0 ; GFX11-NEXT: s_add_u32 s2, s2, 3 ; GFX11-NEXT: s_addc_u32 s3, s3, 0 ; GFX11-NEXT: s_add_u32 s0, s0, 3 ; GFX11-NEXT: s_addc_u32 s1, s1, 0 -; GFX11-NEXT: s_lshr_b32 s10, s8, 16 -; GFX11-NEXT: s_lshr_b32 s11, s9, 16 -; GFX11-NEXT: s_lshr_b32 s12, s7, 16 -; GFX11-NEXT: s_lshr_b32 s13, s6, 16 -; GFX11-NEXT: s_lshr_b32 s14, s5, 16 -; GFX11-NEXT: s_lshr_b32 s15, s4, 16 -; GFX11-NEXT: s_lshr_b32 s40, s29, 16 -; GFX11-NEXT: s_lshr_b32 s41, s28, 16 -; GFX11-NEXT: s_lshr_b32 s42, s27, 16 -; GFX11-NEXT: s_lshr_b32 s43, s26, 16 -; GFX11-NEXT: s_lshr_b32 s44, s25, 16 -; GFX11-NEXT: s_lshr_b32 s45, s24, 16 -; GFX11-NEXT: s_lshr_b32 s46, s23, 16 -; GFX11-NEXT: s_lshr_b32 s47, s22, 16 -; GFX11-NEXT: s_lshr_b32 s56, s21, 16 -; GFX11-NEXT: s_lshr_b32 s57, s20, 16 -; GFX11-NEXT: s_lshr_b32 s58, s19, 16 -; GFX11-NEXT: s_lshr_b32 s59, s18, 16 -; GFX11-NEXT: s_lshr_b32 s60, s17, 16 -; GFX11-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-NEXT: s_lshr_b32 s24, s22, 16 +; GFX11-NEXT: s_lshr_b32 s25, s23, 16 +; GFX11-NEXT: s_lshr_b32 s26, s21, 16 +; GFX11-NEXT: s_lshr_b32 s27, s20, 16 +; GFX11-NEXT: s_lshr_b32 s28, s19, 16 +; GFX11-NEXT: s_lshr_b32 s29, s18, 16 +; GFX11-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-NEXT: s_lshr_b32 s42, s15, 16 +; GFX11-NEXT: s_lshr_b32 s43, s14, 16 +; GFX11-NEXT: s_lshr_b32 s44, s13, 16 +; GFX11-NEXT: s_lshr_b32 s45, s12, 16 +; GFX11-NEXT: s_lshr_b32 s46, s11, 16 +; GFX11-NEXT: s_lshr_b32 s47, s10, 16 +; GFX11-NEXT: s_lshr_b32 s56, s9, 16 +; GFX11-NEXT: s_lshr_b32 s57, s8, 16 +; GFX11-NEXT: s_lshr_b32 s58, s7, 16 +; GFX11-NEXT: s_lshr_b32 s59, s6, 16 +; GFX11-NEXT: s_lshr_b32 s60, s5, 16 +; GFX11-NEXT: s_lshr_b32 s61, s4, 16 ; GFX11-NEXT: s_lshr_b32 s62, s3, 16 ; GFX11-NEXT: s_lshr_b32 s63, s2, 16 ; GFX11-NEXT: s_lshr_b32 s72, s1, 16 @@ -27027,38 +27533,38 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i ; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s72 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s63 ; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s62 -; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s61 -; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s60 -; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s59 -; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s58 -; GFX11-NEXT: s_pack_ll_b32_b16 s20, s20, s57 -; GFX11-NEXT: s_pack_ll_b32_b16 s21, s21, s56 -; GFX11-NEXT: s_pack_ll_b32_b16 s22, s22, s47 -; GFX11-NEXT: s_pack_ll_b32_b16 s23, s23, s46 -; GFX11-NEXT: s_pack_ll_b32_b16 s24, s24, s45 -; GFX11-NEXT: s_pack_ll_b32_b16 s25, s25, s44 -; GFX11-NEXT: s_pack_ll_b32_b16 s26, s26, s43 -; GFX11-NEXT: s_pack_ll_b32_b16 s27, s27, s42 -; GFX11-NEXT: s_pack_ll_b32_b16 s28, s28, s41 -; GFX11-NEXT: s_pack_ll_b32_b16 s29, s29, s40 -; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s15 -; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s14 -; GFX11-NEXT: s_pack_ll_b32_b16 s6, s6, s13 -; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s12 -; GFX11-NEXT: s_pack_ll_b32_b16 s9, s9, s11 -; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s10 +; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s61 +; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s60 +; GFX11-NEXT: s_pack_ll_b32_b16 s6, s6, s59 +; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s58 +; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s57 +; GFX11-NEXT: s_pack_ll_b32_b16 s9, s9, s56 +; GFX11-NEXT: s_pack_ll_b32_b16 s10, s10, s47 +; GFX11-NEXT: s_pack_ll_b32_b16 s11, s11, s46 +; GFX11-NEXT: s_pack_ll_b32_b16 s12, s12, s45 +; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s44 +; GFX11-NEXT: s_pack_ll_b32_b16 s14, s14, s43 +; GFX11-NEXT: s_pack_ll_b32_b16 s15, s15, s42 +; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s41 +; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s40 +; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s29 +; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s28 +; GFX11-NEXT: s_pack_ll_b32_b16 s20, s20, s27 +; GFX11-NEXT: s_pack_ll_b32_b16 s21, s21, s26 +; GFX11-NEXT: s_pack_ll_b32_b16 s23, s23, s25 +; GFX11-NEXT: s_pack_ll_b32_b16 s22, s22, s24 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 -; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 -; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 -; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 -; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 -; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 -; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 -; GFX11-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 -; GFX11-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 -; GFX11-NEXT: v_dual_mov_b32 v22, s9 :: v_dual_mov_b32 v23, s8 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GFX11-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v21, s21 +; GFX11-NEXT: v_dual_mov_b32 v22, s23 :: v_dual_mov_b32 v23, s22 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB45_4: ; GFX11-NEXT: ; implicit-def: $sgpr73 @@ -27079,12 +27585,12 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr41 ; GFX11-NEXT: ; implicit-def: $sgpr40 -; GFX11-NEXT: ; implicit-def: $sgpr15 -; GFX11-NEXT: ; implicit-def: $sgpr14 -; GFX11-NEXT: ; implicit-def: $sgpr13 -; GFX11-NEXT: ; implicit-def: $sgpr12 -; GFX11-NEXT: ; implicit-def: $sgpr11 -; GFX11-NEXT: ; implicit-def: $sgpr10 +; GFX11-NEXT: ; implicit-def: $sgpr29 +; GFX11-NEXT: ; implicit-def: $sgpr28 +; GFX11-NEXT: ; implicit-def: $sgpr27 +; GFX11-NEXT: ; implicit-def: $sgpr26 +; GFX11-NEXT: ; implicit-def: $sgpr25 +; GFX11-NEXT: ; implicit-def: $sgpr24 ; GFX11-NEXT: s_branch .LBB45_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -30514,8 +31020,8 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v24, s17 ; VI-NEXT: v_mov_b32_e32 v19, s18 ; VI-NEXT: v_mov_b32_e32 v20, s19 -; VI-NEXT: v_mov_b32_e32 v15, s20 -; VI-NEXT: v_mov_b32_e32 v16, s21 +; VI-NEXT: v_mov_b32_e32 v13, s20 +; VI-NEXT: v_mov_b32_e32 v14, s21 ; VI-NEXT: v_mov_b32_e32 v11, s22 ; VI-NEXT: v_mov_b32_e32 v12, s23 ; VI-NEXT: v_mov_b32_e32 v21, s24 @@ -30523,8 +31029,8 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v17, s26 ; VI-NEXT: v_mov_b32_e32 v18, s27 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v13, s28 -; VI-NEXT: v_mov_b32_e32 v14, s29 +; VI-NEXT: v_mov_b32_e32 v15, s28 +; VI-NEXT: v_mov_b32_e32 v16, s29 ; VI-NEXT: s_cbranch_scc0 .LBB49_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 @@ -30537,16 +31043,16 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v15 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v17 ; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v21 ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v13 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 @@ -30558,11 +31064,11 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 ; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; VI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; VI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 ; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; VI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 @@ -30575,16 +31081,16 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v15 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v17 ; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v21 ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v13 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 @@ -30599,26 +31105,26 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v33 ; VI-NEXT: v_or_b32_sdwa v33, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v55 -; VI-NEXT: v_or_b32_sdwa v24, v15, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v25 -; VI-NEXT: v_or_b32_sdwa v25, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v26 -; VI-NEXT: v_or_b32_sdwa v26, v11, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v13, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v25 +; VI-NEXT: v_or_b32_sdwa v25, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v26 +; VI-NEXT: v_or_b32_sdwa v26, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v27 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v51 ; VI-NEXT: v_or_b32_sdwa v27, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v53 -; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 -; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v51 ; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 ; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v28 +; VI-NEXT: v_or_b32_sdwa v12, v15, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 ; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 ; VI-NEXT: v_or_b32_sdwa v28, v21, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v29 ; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; VI-NEXT: v_or_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v48 ; VI-NEXT: v_or_b32_sdwa v29, v22, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -30684,8 +31190,8 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v24, s17 ; GFX9-NEXT: v_mov_b32_e32 v19, s18 ; GFX9-NEXT: v_mov_b32_e32 v20, s19 -; GFX9-NEXT: v_mov_b32_e32 v15, s20 -; GFX9-NEXT: v_mov_b32_e32 v16, s21 +; GFX9-NEXT: v_mov_b32_e32 v13, s20 +; GFX9-NEXT: v_mov_b32_e32 v14, s21 ; GFX9-NEXT: v_mov_b32_e32 v11, s22 ; GFX9-NEXT: v_mov_b32_e32 v12, s23 ; GFX9-NEXT: v_mov_b32_e32 v21, s24 @@ -30693,8 +31199,8 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v17, s26 ; GFX9-NEXT: v_mov_b32_e32 v18, s27 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_mov_b32_e32 v13, s28 -; GFX9-NEXT: v_mov_b32_e32 v14, s29 +; GFX9-NEXT: v_mov_b32_e32 v15, s28 +; GFX9-NEXT: v_mov_b32_e32 v16, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB49_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 @@ -30707,16 +31213,16 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v15 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v18 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v22 ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 @@ -30728,11 +31234,11 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 ; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; GFX9-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; GFX9-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 ; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; GFX9-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 @@ -30745,42 +31251,42 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v15 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v18 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v22 ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v23 ; GFX9-NEXT: .LBB49_3: ; %end ; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v23 ; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v24 -; GFX9-NEXT: v_lshl_or_b32 v24, v55, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v24, v55, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v14 ; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v11 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v12 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v13 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v14 ; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v11 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v15 ; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v13 ; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v11 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v22 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v16 ; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 @@ -31553,7 +32059,7 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 ; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 ; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v23 ; SI-NEXT: .LBB50_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -34266,430 +34772,489 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; SI-NEXT: v_readfirstlane_b32 s12, v1 -; SI-NEXT: v_readfirstlane_b32 s13, v2 -; SI-NEXT: v_readfirstlane_b32 s10, v3 -; SI-NEXT: v_readfirstlane_b32 s11, v4 -; SI-NEXT: v_readfirstlane_b32 s8, v5 -; SI-NEXT: v_readfirstlane_b32 s9, v6 -; SI-NEXT: v_readfirstlane_b32 s6, v7 -; SI-NEXT: v_readfirstlane_b32 s7, v8 -; SI-NEXT: v_readfirstlane_b32 s4, v9 -; SI-NEXT: s_and_b64 s[14:15], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s5, v10 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB53_4 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s14, s5, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s14 -; SI-NEXT: s_lshr_b32 s14, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s14 -; SI-NEXT: s_lshr_b32 s14, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s14 -; SI-NEXT: s_lshr_b32 s14, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 -; SI-NEXT: s_lshr_b32 s14, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s14 -; SI-NEXT: s_lshr_b32 s14, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s14 -; SI-NEXT: s_lshr_b32 s14, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s14 -; SI-NEXT: s_lshr_b32 s14, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s14 -; SI-NEXT: s_lshr_b32 s14, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s14 -; SI-NEXT: s_lshr_b32 s14, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s14 -; SI-NEXT: s_lshr_b32 s14, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s14 -; SI-NEXT: s_lshr_b32 s14, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s14 -; SI-NEXT: s_lshr_b32 s14, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s14 -; SI-NEXT: s_lshr_b32 s14, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s14 -; SI-NEXT: s_lshr_b32 s14, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s14 -; SI-NEXT: s_lshr_b32 s14, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s14 -; SI-NEXT: s_lshr_b32 s14, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s14 -; SI-NEXT: s_lshr_b32 s14, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s14 -; SI-NEXT: s_lshr_b32 s14, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s14 -; SI-NEXT: s_lshr_b32 s14, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s14 -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s14 -; SI-NEXT: s_lshr_b32 s14, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s14 -; SI-NEXT: s_lshr_b32 s14, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s14 -; SI-NEXT: s_lshr_b32 s14, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s16 -; SI-NEXT: s_cbranch_execnz .LBB53_3 -; SI-NEXT: .LBB53_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[1:2], s[16:17], 1.0 -; SI-NEXT: v_add_f64 v[50:51], s[18:19], 1.0 -; SI-NEXT: v_add_f64 v[37:38], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[33:34], s[22:23], 1.0 -; SI-NEXT: v_add_f64 v[29:30], s[24:25], 1.0 -; SI-NEXT: v_add_f64 v[27:28], s[26:27], 1.0 -; SI-NEXT: v_add_f64 v[22:23], s[28:29], 1.0 -; SI-NEXT: v_add_f64 v[18:19], s[12:13], 1.0 -; SI-NEXT: v_add_f64 v[14:15], s[10:11], 1.0 -; SI-NEXT: v_add_f64 v[10:11], s[8:9], 1.0 -; SI-NEXT: v_add_f64 v[7:8], s[6:7], 1.0 -; SI-NEXT: v_add_f64 v[3:4], s[4:5], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v51 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v23 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v19 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v15 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v10 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v8 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v3 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v59 +; SI-NEXT: v_mov_b32_e32 v23, s16 +; SI-NEXT: v_mov_b32_e32 v24, s17 +; SI-NEXT: v_mov_b32_e32 v21, s18 +; SI-NEXT: v_mov_b32_e32 v22, s19 +; SI-NEXT: v_mov_b32_e32 v19, s20 +; SI-NEXT: v_mov_b32_e32 v20, s21 +; SI-NEXT: v_mov_b32_e32 v15, s22 +; SI-NEXT: v_mov_b32_e32 v16, s23 +; SI-NEXT: v_mov_b32_e32 v17, s24 +; SI-NEXT: v_mov_b32_e32 v18, s25 +; SI-NEXT: v_mov_b32_e32 v13, s26 +; SI-NEXT: v_mov_b32_e32 v14, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v11, s28 +; SI-NEXT: v_mov_b32_e32 v12, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB53_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v9 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v25 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v7 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v10 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v4 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v12 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v20 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v53, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v23 +; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v63, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_mov_b32_e32 v35, v10 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: .LBB53_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_or_b32_e32 v55, v55, v40 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: buffer_store_dword v55, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v55, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v53, v53, v54 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: buffer_store_dword v53, v55, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v53, vcc, 8, v0 -; SI-NEXT: v_or_b32_e32 v51, v51, v52 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: buffer_store_dword v51, v53, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v51, vcc, 12, v0 -; SI-NEXT: v_or_b32_e32 v49, v49, v50 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v49, vcc, 16, v0 -; SI-NEXT: v_or_b32_e32 v39, v48, v39 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: buffer_store_dword v39, v49, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v39, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v37, v38, v37 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v37, vcc, 24, v0 -; SI-NEXT: v_or_b32_e32 v35, v36, v35 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v35, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v33, v34, v33 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v33, vcc, 32, v0 -; SI-NEXT: v_or_b32_e32 v31, v32, v31 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v26 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v31, vcc, 36, v0 -; SI-NEXT: v_or_b32_e32 v29, v30, v29 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v29, vcc, 40, v0 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v27, vcc, 44, v0 -; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v25 -; SI-NEXT: v_add_i32_e32 v25, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 -; SI-NEXT: v_add_i32_e32 v23, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 -; SI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v10, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: buffer_store_dword v7, v11, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: s_branch .LBB53_2 ; ; VI-LABEL: bitcast_v12f64_to_v48f16_scalar: @@ -34700,8 +35265,8 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; VI-NEXT: v_mov_b32_e32 v24, s17 ; VI-NEXT: v_mov_b32_e32 v19, s18 ; VI-NEXT: v_mov_b32_e32 v20, s19 -; VI-NEXT: v_mov_b32_e32 v15, s20 -; VI-NEXT: v_mov_b32_e32 v16, s21 +; VI-NEXT: v_mov_b32_e32 v13, s20 +; VI-NEXT: v_mov_b32_e32 v14, s21 ; VI-NEXT: v_mov_b32_e32 v11, s22 ; VI-NEXT: v_mov_b32_e32 v12, s23 ; VI-NEXT: v_mov_b32_e32 v21, s24 @@ -34709,8 +35274,8 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; VI-NEXT: v_mov_b32_e32 v17, s26 ; VI-NEXT: v_mov_b32_e32 v18, s27 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v13, s28 -; VI-NEXT: v_mov_b32_e32 v14, s29 +; VI-NEXT: v_mov_b32_e32 v15, s28 +; VI-NEXT: v_mov_b32_e32 v16, s29 ; VI-NEXT: s_cbranch_scc0 .LBB53_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 @@ -34723,16 +35288,16 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v15 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v17 ; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v21 ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v13 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 @@ -34744,11 +35309,11 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 ; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; VI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; VI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 ; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; VI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 @@ -34761,16 +35326,16 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v15 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v17 ; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v21 ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v13 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 @@ -34785,26 +35350,26 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v33 ; VI-NEXT: v_or_b32_sdwa v33, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v55 -; VI-NEXT: v_or_b32_sdwa v24, v15, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v25 -; VI-NEXT: v_or_b32_sdwa v25, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v26 -; VI-NEXT: v_or_b32_sdwa v26, v11, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v13, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v25 +; VI-NEXT: v_or_b32_sdwa v25, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v26 +; VI-NEXT: v_or_b32_sdwa v26, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v27 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v51 ; VI-NEXT: v_or_b32_sdwa v27, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v53 -; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 -; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v51 ; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 ; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v28 +; VI-NEXT: v_or_b32_sdwa v12, v15, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 ; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 ; VI-NEXT: v_or_b32_sdwa v28, v21, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v29 ; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; VI-NEXT: v_or_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v48 ; VI-NEXT: v_or_b32_sdwa v29, v22, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -34870,8 +35435,8 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; GFX9-NEXT: v_mov_b32_e32 v24, s17 ; GFX9-NEXT: v_mov_b32_e32 v19, s18 ; GFX9-NEXT: v_mov_b32_e32 v20, s19 -; GFX9-NEXT: v_mov_b32_e32 v15, s20 -; GFX9-NEXT: v_mov_b32_e32 v16, s21 +; GFX9-NEXT: v_mov_b32_e32 v13, s20 +; GFX9-NEXT: v_mov_b32_e32 v14, s21 ; GFX9-NEXT: v_mov_b32_e32 v11, s22 ; GFX9-NEXT: v_mov_b32_e32 v12, s23 ; GFX9-NEXT: v_mov_b32_e32 v21, s24 @@ -34879,8 +35444,8 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; GFX9-NEXT: v_mov_b32_e32 v17, s26 ; GFX9-NEXT: v_mov_b32_e32 v18, s27 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_mov_b32_e32 v13, s28 -; GFX9-NEXT: v_mov_b32_e32 v14, s29 +; GFX9-NEXT: v_mov_b32_e32 v15, s28 +; GFX9-NEXT: v_mov_b32_e32 v16, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB53_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 @@ -34893,16 +35458,16 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v15 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v18 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v22 ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 @@ -34914,11 +35479,11 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 ; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; GFX9-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; GFX9-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 ; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; GFX9-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 @@ -34931,42 +35496,42 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v15 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v18 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v22 ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v23 ; GFX9-NEXT: .LBB53_3: ; %end ; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v23 ; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v24 -; GFX9-NEXT: v_lshl_or_b32 v24, v55, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v24, v55, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v14 ; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v11 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v12 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v13 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v14 ; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v11 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v15 ; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v13 ; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v11 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v22 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v16 ; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll index c7a199328012d..93690270fd797 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll @@ -3409,18 +3409,46 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v14, s30, 0 -; SI-NEXT: v_writelane_b32 v14, s31, 1 -; SI-NEXT: v_writelane_b32 v14, s34, 2 -; SI-NEXT: v_writelane_b32 v14, s35, 3 -; SI-NEXT: v_writelane_b32 v14, s36, 4 -; SI-NEXT: v_writelane_b32 v14, s37, 5 -; SI-NEXT: v_writelane_b32 v14, s38, 6 +; SI-NEXT: v_writelane_b32 v20, s30, 0 +; SI-NEXT: v_writelane_b32 v20, s31, 1 +; SI-NEXT: v_writelane_b32 v20, s34, 2 +; SI-NEXT: v_writelane_b32 v20, s35, 3 +; SI-NEXT: v_writelane_b32 v20, s36, 4 +; SI-NEXT: v_mov_b32_e32 v14, s16 +; SI-NEXT: v_mov_b32_e32 v15, s17 +; SI-NEXT: v_writelane_b32 v20, s37, 5 +; SI-NEXT: v_mov_b32_e32 v16, s18 +; SI-NEXT: v_mov_b32_e32 v17, s19 +; SI-NEXT: v_mov_b32_e32 v18, s20 +; SI-NEXT: v_mov_b32_e32 v19, s21 +; SI-NEXT: v_readfirstlane_b32 s42, v14 +; SI-NEXT: v_mov_b32_e32 v14, s22 +; SI-NEXT: v_readfirstlane_b32 s43, v15 +; SI-NEXT: v_mov_b32_e32 v15, s23 +; SI-NEXT: v_writelane_b32 v20, s38, 6 +; SI-NEXT: v_readfirstlane_b32 s40, v16 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_readfirstlane_b32 s41, v17 +; SI-NEXT: v_mov_b32_e32 v17, s25 +; SI-NEXT: v_readfirstlane_b32 s24, v18 +; SI-NEXT: v_mov_b32_e32 v18, s26 +; SI-NEXT: v_readfirstlane_b32 s25, v19 +; SI-NEXT: v_mov_b32_e32 v19, s27 +; SI-NEXT: v_readfirstlane_b32 s22, v14 +; SI-NEXT: v_mov_b32_e32 v14, s28 +; SI-NEXT: v_readfirstlane_b32 s23, v15 +; SI-NEXT: v_mov_b32_e32 v15, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; SI-NEXT: v_writelane_b32 v14, s39, 7 +; SI-NEXT: v_writelane_b32 v20, s39, 7 +; SI-NEXT: v_readfirstlane_b32 s20, v16 +; SI-NEXT: v_readfirstlane_b32 s21, v17 +; SI-NEXT: v_readfirstlane_b32 s18, v18 +; SI-NEXT: v_readfirstlane_b32 s19, v19 +; SI-NEXT: v_readfirstlane_b32 s16, v14 +; SI-NEXT: v_readfirstlane_b32 s17, v15 ; SI-NEXT: v_readfirstlane_b32 s14, v1 ; SI-NEXT: v_readfirstlane_b32 s15, v2 ; SI-NEXT: v_readfirstlane_b32 s12, v3 @@ -3432,9 +3460,9 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3 ; SI-NEXT: v_readfirstlane_b32 s6, v9 ; SI-NEXT: v_readfirstlane_b32 s7, v10 ; SI-NEXT: v_readfirstlane_b32 s4, v11 -; SI-NEXT: s_and_b64 s[40:41], vcc, exec +; SI-NEXT: s_and_b64 s[26:27], vcc, exec ; SI-NEXT: v_readfirstlane_b32 s5, v12 -; SI-NEXT: v_writelane_b32 v14, s48, 8 +; SI-NEXT: v_writelane_b32 v20, s48, 8 ; SI-NEXT: s_cbranch_scc0 .LBB13_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s92, s5, 16 @@ -3443,42 +3471,42 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3 ; SI-NEXT: s_lshr_b32 s95, s11, 16 ; SI-NEXT: s_lshr_b32 s30, s13, 16 ; SI-NEXT: s_lshr_b32 s31, s15, 16 -; SI-NEXT: s_lshr_b32 s34, s29, 16 -; SI-NEXT: s_lshr_b32 s35, s27, 16 -; SI-NEXT: s_lshr_b32 s36, s25, 16 +; SI-NEXT: s_lshr_b32 s34, s17, 16 +; SI-NEXT: s_lshr_b32 s35, s19, 16 +; SI-NEXT: s_lshr_b32 s36, s21, 16 ; SI-NEXT: s_lshr_b32 s37, s23, 16 -; SI-NEXT: s_lshr_b32 s38, s21, 16 -; SI-NEXT: s_lshr_b32 s39, s19, 16 -; SI-NEXT: s_lshr_b32 s48, s17, 16 -; SI-NEXT: s_lshr_b64 s[40:41], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[42:43], s[6:7], 16 +; SI-NEXT: s_lshr_b32 s38, s25, 16 +; SI-NEXT: s_lshr_b32 s39, s41, 16 +; SI-NEXT: s_lshr_b32 s48, s43, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 ; SI-NEXT: s_lshr_b64 s[44:45], s[8:9], 16 ; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 16 ; SI-NEXT: s_lshr_b64 s[56:57], s[12:13], 16 ; SI-NEXT: s_lshr_b64 s[58:59], s[14:15], 16 -; SI-NEXT: s_lshr_b64 s[60:61], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[62:63], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[72:73], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[20:21], 16 ; SI-NEXT: s_lshr_b64 s[74:75], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[78:79], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[88:89], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[42:43], 16 ; SI-NEXT: s_cbranch_execnz .LBB13_3 ; SI-NEXT: .LBB13_2: ; %cmp.true -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_add_i32 s40, s40, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s15, s15, 3 ; SI-NEXT: s_add_i32 s14, s14, 3 ; SI-NEXT: s_add_i32 s13, s13, 3 @@ -3491,118 +3519,118 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3 ; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s5, s5, 3 ; SI-NEXT: s_add_i32 s4, s4, 3 -; SI-NEXT: s_lshr_b64 s[40:41], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[42:43], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 ; SI-NEXT: s_lshr_b64 s[44:45], s[8:9], 16 ; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 16 ; SI-NEXT: s_lshr_b64 s[56:57], s[12:13], 16 ; SI-NEXT: s_lshr_b64 s[58:59], s[14:15], 16 -; SI-NEXT: s_lshr_b64 s[60:61], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[16:17], 16 ; SI-NEXT: s_lshr_b32 s92, s5, 16 ; SI-NEXT: s_lshr_b32 s93, s7, 16 ; SI-NEXT: s_lshr_b32 s94, s9, 16 ; SI-NEXT: s_lshr_b32 s95, s11, 16 ; SI-NEXT: s_lshr_b32 s30, s13, 16 ; SI-NEXT: s_lshr_b32 s31, s15, 16 -; SI-NEXT: s_lshr_b32 s34, s29, 16 -; SI-NEXT: s_lshr_b32 s35, s27, 16 -; SI-NEXT: s_lshr_b32 s36, s25, 16 +; SI-NEXT: s_lshr_b32 s34, s17, 16 +; SI-NEXT: s_lshr_b32 s35, s19, 16 +; SI-NEXT: s_lshr_b32 s36, s21, 16 ; SI-NEXT: s_lshr_b32 s37, s23, 16 -; SI-NEXT: s_lshr_b32 s38, s21, 16 -; SI-NEXT: s_lshr_b32 s39, s19, 16 -; SI-NEXT: s_lshr_b32 s48, s17, 16 -; SI-NEXT: s_lshr_b64 s[62:63], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[72:73], s[24:25], 16 +; SI-NEXT: s_lshr_b32 s38, s25, 16 +; SI-NEXT: s_lshr_b32 s39, s41, 16 +; SI-NEXT: s_lshr_b32 s48, s43, 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[20:21], 16 ; SI-NEXT: s_lshr_b64 s[74:75], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[78:79], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[88:89], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[42:43], 16 ; SI-NEXT: .LBB13_3: ; %end -; SI-NEXT: s_lshl_b32 s41, s88, 16 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s16, s16, s41 -; SI-NEXT: v_mov_b32_e32 v1, s16 -; SI-NEXT: s_and_b32 s16, s17, 0xffff -; SI-NEXT: s_lshl_b32 s17, s48, 16 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_lshl_b32 s16, s78, 16 -; SI-NEXT: s_and_b32 s17, s18, 0xffff -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_mov_b32_e32 v3, s16 -; SI-NEXT: s_and_b32 s16, s19, 0xffff -; SI-NEXT: s_lshl_b32 s17, s39, 16 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_mov_b32_e32 v4, s16 -; SI-NEXT: s_lshl_b32 s16, s76, 16 -; SI-NEXT: s_and_b32 s17, s20, 0xffff -; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_lshl_b32 s27, s88, 16 +; SI-NEXT: s_and_b32 s29, s42, 0xffff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: v_mov_b32_e32 v1, s27 +; SI-NEXT: s_and_b32 s27, s43, 0xffff +; SI-NEXT: s_lshl_b32 s29, s48, 16 +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_lshl_b32 s27, s78, 16 +; SI-NEXT: s_and_b32 s29, s40, 0xffff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: v_mov_b32_e32 v3, s27 +; SI-NEXT: s_and_b32 s27, s41, 0xffff +; SI-NEXT: s_lshl_b32 s29, s39, 16 +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_mov_b32_e32 v4, s27 +; SI-NEXT: s_lshl_b32 s27, s76, 16 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_or_b32 s24, s24, s27 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_mov_b32_e32 v5, s16 -; SI-NEXT: s_and_b32 s16, s21, 0xffff +; SI-NEXT: v_mov_b32_e32 v5, s24 +; SI-NEXT: s_and_b32 s24, s25, 0xffff ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: s_lshl_b32 s17, s38, 16 +; SI-NEXT: s_lshl_b32 s25, s38, 16 ; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s24, s24, s25 ; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s22, 0xffff -; SI-NEXT: s_lshl_b32 s17, s74, 16 +; SI-NEXT: v_mov_b32_e32 v2, s24 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_lshl_b32 s24, s74, 16 ; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s22, s22, s24 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s23, 0xffff -; SI-NEXT: s_lshl_b32 s17, s37, 16 +; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: s_and_b32 s22, s23, 0xffff +; SI-NEXT: s_lshl_b32 s23, s37, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s22, s22, s23 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s24, 0xffff -; SI-NEXT: s_lshl_b32 s17, s72, 16 +; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_lshl_b32 s22, s72, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s20, s20, s22 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s25, 0xffff -; SI-NEXT: s_lshl_b32 s17, s36, 16 +; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_and_b32 s20, s21, 0xffff +; SI-NEXT: s_lshl_b32 s21, s36, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s20, s20, s21 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s26, 0xffff -; SI-NEXT: s_lshl_b32 s17, s62, 16 +; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s20, s62, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s18, s18, s20 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s27, 0xffff -; SI-NEXT: s_lshl_b32 s17, s35, 16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_and_b32 s18, s19, 0xffff +; SI-NEXT: s_lshl_b32 s19, s35, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s28, 0xffff -; SI-NEXT: s_lshl_b32 s17, s60, 16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s18, s60, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s16, s16, s18 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s29, 0xffff +; SI-NEXT: s_and_b32 s16, s17, 0xffff ; SI-NEXT: s_lshl_b32 s17, s34, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 ; SI-NEXT: s_or_b32 s16, s16, s17 @@ -3666,7 +3694,7 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_lshl_b32 s8, s42, 16 +; SI-NEXT: s_lshl_b32 s8, s28, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x54, v0 ; SI-NEXT: s_or_b32 s6, s6, s8 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -3680,7 +3708,7 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s6, s40, 16 +; SI-NEXT: s_lshl_b32 s6, s26, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x5c, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -3694,17 +3722,17 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: v_readlane_b32 s48, v14, 8 -; SI-NEXT: v_readlane_b32 s39, v14, 7 -; SI-NEXT: v_readlane_b32 s38, v14, 6 -; SI-NEXT: v_readlane_b32 s37, v14, 5 -; SI-NEXT: v_readlane_b32 s36, v14, 4 -; SI-NEXT: v_readlane_b32 s35, v14, 3 -; SI-NEXT: v_readlane_b32 s34, v14, 2 -; SI-NEXT: v_readlane_b32 s31, v14, 1 -; SI-NEXT: v_readlane_b32 s30, v14, 0 +; SI-NEXT: v_readlane_b32 s48, v20, 8 +; SI-NEXT: v_readlane_b32 s39, v20, 7 +; SI-NEXT: v_readlane_b32 s38, v20, 6 +; SI-NEXT: v_readlane_b32 s37, v20, 5 +; SI-NEXT: v_readlane_b32 s36, v20, 4 +; SI-NEXT: v_readlane_b32 s35, v20, 3 +; SI-NEXT: v_readlane_b32 s34, v20, 2 +; SI-NEXT: v_readlane_b32 s31, v20, 1 +; SI-NEXT: v_readlane_b32 s30, v20, 0 ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3733,16 +3761,44 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr94 ; SI-NEXT: ; implicit-def: $sgpr93 ; SI-NEXT: ; implicit-def: $sgpr92 -; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: s_branch .LBB13_2 ; ; VI-LABEL: bitcast_v26i32_to_v52i16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, s16 +; VI-NEXT: v_mov_b32_e32 v14, s17 +; VI-NEXT: v_mov_b32_e32 v15, s18 +; VI-NEXT: v_mov_b32_e32 v16, s19 +; VI-NEXT: v_mov_b32_e32 v17, s20 +; VI-NEXT: v_mov_b32_e32 v18, s21 +; VI-NEXT: v_mov_b32_e32 v19, s22 +; VI-NEXT: v_readfirstlane_b32 s44, v13 +; VI-NEXT: v_mov_b32_e32 v13, s23 +; VI-NEXT: v_readfirstlane_b32 s43, v14 +; VI-NEXT: v_mov_b32_e32 v14, s24 +; VI-NEXT: v_readfirstlane_b32 s42, v15 +; VI-NEXT: v_mov_b32_e32 v15, s25 +; VI-NEXT: v_readfirstlane_b32 s41, v16 +; VI-NEXT: v_mov_b32_e32 v16, s26 +; VI-NEXT: v_readfirstlane_b32 s40, v17 +; VI-NEXT: v_mov_b32_e32 v17, s27 +; VI-NEXT: v_readfirstlane_b32 s26, v18 +; VI-NEXT: v_mov_b32_e32 v18, s28 +; VI-NEXT: v_readfirstlane_b32 s25, v19 +; VI-NEXT: v_mov_b32_e32 v19, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; VI-NEXT: v_readfirstlane_b32 s41, v0 -; VI-NEXT: v_readfirstlane_b32 s40, v1 +; VI-NEXT: v_readfirstlane_b32 s24, v13 +; VI-NEXT: v_readfirstlane_b32 s23, v14 +; VI-NEXT: v_readfirstlane_b32 s22, v15 +; VI-NEXT: v_readfirstlane_b32 s21, v16 +; VI-NEXT: v_readfirstlane_b32 s20, v17 +; VI-NEXT: v_readfirstlane_b32 s19, v18 +; VI-NEXT: v_readfirstlane_b32 s18, v19 +; VI-NEXT: v_readfirstlane_b32 s17, v0 +; VI-NEXT: v_readfirstlane_b32 s16, v1 ; VI-NEXT: v_readfirstlane_b32 s15, v2 ; VI-NEXT: v_readfirstlane_b32 s14, v3 ; VI-NEXT: v_readfirstlane_b32 s13, v4 @@ -3756,9 +3812,9 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3 ; VI-NEXT: v_readfirstlane_b32 s7, v11 ; VI-NEXT: s_cbranch_scc0 .LBB13_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s42, s7, 16 -; VI-NEXT: s_lshr_b32 s43, s6, 16 -; VI-NEXT: s_lshr_b32 s44, s8, 16 +; VI-NEXT: s_lshr_b32 s27, s7, 16 +; VI-NEXT: s_lshr_b32 s28, s6, 16 +; VI-NEXT: s_lshr_b32 s29, s8, 16 ; VI-NEXT: s_lshr_b32 s45, s9, 16 ; VI-NEXT: s_lshr_b32 s46, s10, 16 ; VI-NEXT: s_lshr_b32 s47, s11, 16 @@ -3766,22 +3822,22 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3 ; VI-NEXT: s_lshr_b32 s57, s13, 16 ; VI-NEXT: s_lshr_b32 s58, s14, 16 ; VI-NEXT: s_lshr_b32 s59, s15, 16 -; VI-NEXT: s_lshr_b32 s60, s40, 16 -; VI-NEXT: s_lshr_b32 s61, s41, 16 -; VI-NEXT: s_lshr_b32 s62, s29, 16 -; VI-NEXT: s_lshr_b32 s63, s28, 16 -; VI-NEXT: s_lshr_b32 s72, s27, 16 -; VI-NEXT: s_lshr_b32 s73, s26, 16 -; VI-NEXT: s_lshr_b32 s74, s25, 16 -; VI-NEXT: s_lshr_b32 s75, s24, 16 -; VI-NEXT: s_lshr_b32 s76, s23, 16 -; VI-NEXT: s_lshr_b32 s77, s22, 16 -; VI-NEXT: s_lshr_b32 s78, s21, 16 -; VI-NEXT: s_lshr_b32 s79, s20, 16 -; VI-NEXT: s_lshr_b32 s88, s19, 16 -; VI-NEXT: s_lshr_b32 s89, s18, 16 -; VI-NEXT: s_lshr_b32 s90, s17, 16 -; VI-NEXT: s_lshr_b32 s91, s16, 16 +; VI-NEXT: s_lshr_b32 s60, s16, 16 +; VI-NEXT: s_lshr_b32 s61, s17, 16 +; VI-NEXT: s_lshr_b32 s62, s18, 16 +; VI-NEXT: s_lshr_b32 s63, s19, 16 +; VI-NEXT: s_lshr_b32 s72, s20, 16 +; VI-NEXT: s_lshr_b32 s73, s21, 16 +; VI-NEXT: s_lshr_b32 s74, s22, 16 +; VI-NEXT: s_lshr_b32 s75, s23, 16 +; VI-NEXT: s_lshr_b32 s76, s24, 16 +; VI-NEXT: s_lshr_b32 s77, s25, 16 +; VI-NEXT: s_lshr_b32 s78, s26, 16 +; VI-NEXT: s_lshr_b32 s79, s40, 16 +; VI-NEXT: s_lshr_b32 s88, s41, 16 +; VI-NEXT: s_lshr_b32 s89, s42, 16 +; VI-NEXT: s_lshr_b32 s90, s43, 16 +; VI-NEXT: s_lshr_b32 s91, s44, 16 ; VI-NEXT: s_cbranch_execnz .LBB13_3 ; VI-NEXT: .LBB13_2: ; %cmp.true ; VI-NEXT: s_add_i32 s7, s7, 3 @@ -3794,25 +3850,25 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3 ; VI-NEXT: s_add_i32 s13, s13, 3 ; VI-NEXT: s_add_i32 s14, s14, 3 ; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 ; VI-NEXT: s_add_i32 s40, s40, 3 ; VI-NEXT: s_add_i32 s41, s41, 3 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_lshr_b32 s42, s7, 16 -; VI-NEXT: s_lshr_b32 s43, s6, 16 -; VI-NEXT: s_lshr_b32 s44, s8, 16 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_add_i32 s44, s44, 3 +; VI-NEXT: s_lshr_b32 s27, s7, 16 +; VI-NEXT: s_lshr_b32 s28, s6, 16 +; VI-NEXT: s_lshr_b32 s29, s8, 16 ; VI-NEXT: s_lshr_b32 s45, s9, 16 ; VI-NEXT: s_lshr_b32 s46, s10, 16 ; VI-NEXT: s_lshr_b32 s47, s11, 16 @@ -3820,117 +3876,117 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3 ; VI-NEXT: s_lshr_b32 s57, s13, 16 ; VI-NEXT: s_lshr_b32 s58, s14, 16 ; VI-NEXT: s_lshr_b32 s59, s15, 16 -; VI-NEXT: s_lshr_b32 s60, s40, 16 -; VI-NEXT: s_lshr_b32 s61, s41, 16 -; VI-NEXT: s_lshr_b32 s62, s29, 16 -; VI-NEXT: s_lshr_b32 s63, s28, 16 -; VI-NEXT: s_lshr_b32 s72, s27, 16 -; VI-NEXT: s_lshr_b32 s73, s26, 16 -; VI-NEXT: s_lshr_b32 s74, s25, 16 -; VI-NEXT: s_lshr_b32 s75, s24, 16 -; VI-NEXT: s_lshr_b32 s76, s23, 16 -; VI-NEXT: s_lshr_b32 s77, s22, 16 -; VI-NEXT: s_lshr_b32 s78, s21, 16 -; VI-NEXT: s_lshr_b32 s79, s20, 16 -; VI-NEXT: s_lshr_b32 s88, s19, 16 -; VI-NEXT: s_lshr_b32 s89, s18, 16 -; VI-NEXT: s_lshr_b32 s90, s17, 16 -; VI-NEXT: s_lshr_b32 s91, s16, 16 +; VI-NEXT: s_lshr_b32 s60, s16, 16 +; VI-NEXT: s_lshr_b32 s61, s17, 16 +; VI-NEXT: s_lshr_b32 s62, s18, 16 +; VI-NEXT: s_lshr_b32 s63, s19, 16 +; VI-NEXT: s_lshr_b32 s72, s20, 16 +; VI-NEXT: s_lshr_b32 s73, s21, 16 +; VI-NEXT: s_lshr_b32 s74, s22, 16 +; VI-NEXT: s_lshr_b32 s75, s23, 16 +; VI-NEXT: s_lshr_b32 s76, s24, 16 +; VI-NEXT: s_lshr_b32 s77, s25, 16 +; VI-NEXT: s_lshr_b32 s78, s26, 16 +; VI-NEXT: s_lshr_b32 s79, s40, 16 +; VI-NEXT: s_lshr_b32 s88, s41, 16 +; VI-NEXT: s_lshr_b32 s89, s42, 16 +; VI-NEXT: s_lshr_b32 s90, s43, 16 +; VI-NEXT: s_lshr_b32 s91, s44, 16 ; VI-NEXT: .LBB13_3: ; %end -; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_and_b32 s4, 0xffff, s44 ; VI-NEXT: s_lshl_b32 s5, s91, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s16, s90, 16 -; VI-NEXT: s_or_b32 s5, s5, s16 -; VI-NEXT: s_and_b32 s16, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s17, s89, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_and_b32 s17, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s18, s88, 16 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s18, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s19, s79, 16 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_and_b32 s19, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s20, s78, 16 -; VI-NEXT: s_or_b32 s19, s19, s20 -; VI-NEXT: s_and_b32 s20, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s21, s77, 16 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_and_b32 s21, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s22, s76, 16 -; VI-NEXT: s_or_b32 s21, s21, s22 -; VI-NEXT: s_and_b32 s22, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s23, s75, 16 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_and_b32 s23, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s24, s74, 16 -; VI-NEXT: s_or_b32 s23, s23, s24 -; VI-NEXT: s_and_b32 s24, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s25, s73, 16 -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_and_b32 s25, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s26, s72, 16 -; VI-NEXT: s_or_b32 s25, s25, s26 -; VI-NEXT: s_and_b32 s26, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s27, s63, 16 -; VI-NEXT: s_or_b32 s26, s26, s27 -; VI-NEXT: s_and_b32 s27, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s28, s62, 16 -; VI-NEXT: s_or_b32 s27, s27, s28 -; VI-NEXT: s_and_b32 s28, 0xffff, s41 -; VI-NEXT: s_lshl_b32 s29, s61, 16 -; VI-NEXT: s_or_b32 s28, s28, s29 -; VI-NEXT: s_and_b32 s29, 0xffff, s40 -; VI-NEXT: s_lshl_b32 s40, s60, 16 -; VI-NEXT: s_or_b32 s29, s29, s40 +; VI-NEXT: s_and_b32 s5, 0xffff, s43 +; VI-NEXT: s_lshl_b32 s43, s90, 16 +; VI-NEXT: s_or_b32 s5, s5, s43 +; VI-NEXT: s_and_b32 s42, 0xffff, s42 +; VI-NEXT: s_lshl_b32 s43, s89, 16 +; VI-NEXT: s_or_b32 s42, s42, s43 +; VI-NEXT: s_and_b32 s41, 0xffff, s41 +; VI-NEXT: s_lshl_b32 s43, s88, 16 +; VI-NEXT: s_or_b32 s41, s41, s43 +; VI-NEXT: s_and_b32 s40, 0xffff, s40 +; VI-NEXT: s_lshl_b32 s43, s79, 16 +; VI-NEXT: s_or_b32 s40, s40, s43 +; VI-NEXT: s_and_b32 s26, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s43, s78, 16 +; VI-NEXT: s_or_b32 s26, s26, s43 +; VI-NEXT: s_and_b32 s25, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s43, s77, 16 +; VI-NEXT: s_or_b32 s25, s25, s43 +; VI-NEXT: s_and_b32 s24, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s43, s76, 16 +; VI-NEXT: s_or_b32 s24, s24, s43 +; VI-NEXT: s_and_b32 s23, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s43, s75, 16 +; VI-NEXT: s_or_b32 s23, s23, s43 +; VI-NEXT: s_and_b32 s22, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s43, s74, 16 +; VI-NEXT: s_or_b32 s22, s22, s43 +; VI-NEXT: s_and_b32 s21, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s43, s73, 16 +; VI-NEXT: s_or_b32 s21, s21, s43 +; VI-NEXT: s_and_b32 s20, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s43, s72, 16 +; VI-NEXT: s_or_b32 s20, s20, s43 +; VI-NEXT: s_and_b32 s19, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s43, s63, 16 +; VI-NEXT: s_or_b32 s19, s19, s43 +; VI-NEXT: s_and_b32 s18, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s43, s62, 16 +; VI-NEXT: s_or_b32 s18, s18, s43 +; VI-NEXT: s_and_b32 s17, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s43, s61, 16 +; VI-NEXT: s_or_b32 s17, s17, s43 +; VI-NEXT: s_and_b32 s16, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s43, s60, 16 +; VI-NEXT: s_or_b32 s16, s16, s43 ; VI-NEXT: s_and_b32 s15, 0xffff, s15 -; VI-NEXT: s_lshl_b32 s40, s59, 16 -; VI-NEXT: s_or_b32 s15, s15, s40 +; VI-NEXT: s_lshl_b32 s43, s59, 16 +; VI-NEXT: s_or_b32 s15, s15, s43 ; VI-NEXT: s_and_b32 s14, 0xffff, s14 -; VI-NEXT: s_lshl_b32 s40, s58, 16 -; VI-NEXT: s_or_b32 s14, s14, s40 +; VI-NEXT: s_lshl_b32 s43, s58, 16 +; VI-NEXT: s_or_b32 s14, s14, s43 ; VI-NEXT: s_and_b32 s13, 0xffff, s13 -; VI-NEXT: s_lshl_b32 s40, s57, 16 -; VI-NEXT: s_or_b32 s13, s13, s40 +; VI-NEXT: s_lshl_b32 s43, s57, 16 +; VI-NEXT: s_or_b32 s13, s13, s43 ; VI-NEXT: s_and_b32 s12, 0xffff, s12 -; VI-NEXT: s_lshl_b32 s40, s56, 16 -; VI-NEXT: s_or_b32 s12, s12, s40 +; VI-NEXT: s_lshl_b32 s43, s56, 16 +; VI-NEXT: s_or_b32 s12, s12, s43 ; VI-NEXT: s_and_b32 s11, 0xffff, s11 -; VI-NEXT: s_lshl_b32 s40, s47, 16 -; VI-NEXT: s_or_b32 s11, s11, s40 +; VI-NEXT: s_lshl_b32 s43, s47, 16 +; VI-NEXT: s_or_b32 s11, s11, s43 ; VI-NEXT: s_and_b32 s10, 0xffff, s10 -; VI-NEXT: s_lshl_b32 s40, s46, 16 -; VI-NEXT: s_or_b32 s10, s10, s40 +; VI-NEXT: s_lshl_b32 s43, s46, 16 +; VI-NEXT: s_or_b32 s10, s10, s43 ; VI-NEXT: s_and_b32 s9, 0xffff, s9 -; VI-NEXT: s_lshl_b32 s40, s45, 16 -; VI-NEXT: s_or_b32 s9, s9, s40 +; VI-NEXT: s_lshl_b32 s43, s45, 16 ; VI-NEXT: s_and_b32 s8, 0xffff, s8 -; VI-NEXT: s_lshl_b32 s40, s44, 16 -; VI-NEXT: s_or_b32 s8, s8, s40 +; VI-NEXT: s_lshl_b32 s29, s29, 16 ; VI-NEXT: s_and_b32 s6, 0xffff, s6 -; VI-NEXT: s_lshl_b32 s40, s43, 16 -; VI-NEXT: s_or_b32 s6, s6, s40 +; VI-NEXT: s_lshl_b32 s28, s28, 16 ; VI-NEXT: s_and_b32 s7, 0xffff, s7 -; VI-NEXT: s_lshl_b32 s40, s42, 16 -; VI-NEXT: s_or_b32 s7, s7, s40 +; VI-NEXT: s_lshl_b32 s27, s27, 16 +; VI-NEXT: s_or_b32 s9, s9, s43 +; VI-NEXT: s_or_b32 s8, s8, s29 +; VI-NEXT: s_or_b32 s6, s6, s28 +; VI-NEXT: s_or_b32 s7, s7, s27 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: v_mov_b32_e32 v3, s17 -; VI-NEXT: v_mov_b32_e32 v4, s18 -; VI-NEXT: v_mov_b32_e32 v5, s19 -; VI-NEXT: v_mov_b32_e32 v6, s20 -; VI-NEXT: v_mov_b32_e32 v7, s21 -; VI-NEXT: v_mov_b32_e32 v8, s22 -; VI-NEXT: v_mov_b32_e32 v9, s23 -; VI-NEXT: v_mov_b32_e32 v10, s24 -; VI-NEXT: v_mov_b32_e32 v11, s25 -; VI-NEXT: v_mov_b32_e32 v12, s26 -; VI-NEXT: v_mov_b32_e32 v13, s27 -; VI-NEXT: v_mov_b32_e32 v14, s28 -; VI-NEXT: v_mov_b32_e32 v15, s29 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_mov_b32_e32 v3, s41 +; VI-NEXT: v_mov_b32_e32 v4, s40 +; VI-NEXT: v_mov_b32_e32 v5, s26 +; VI-NEXT: v_mov_b32_e32 v6, s25 +; VI-NEXT: v_mov_b32_e32 v7, s24 +; VI-NEXT: v_mov_b32_e32 v8, s23 +; VI-NEXT: v_mov_b32_e32 v9, s22 +; VI-NEXT: v_mov_b32_e32 v10, s21 +; VI-NEXT: v_mov_b32_e32 v11, s20 +; VI-NEXT: v_mov_b32_e32 v12, s19 +; VI-NEXT: v_mov_b32_e32 v13, s18 +; VI-NEXT: v_mov_b32_e32 v14, s17 +; VI-NEXT: v_mov_b32_e32 v15, s16 ; VI-NEXT: v_mov_b32_e32 v16, s15 ; VI-NEXT: v_mov_b32_e32 v17, s14 ; VI-NEXT: v_mov_b32_e32 v18, s13 @@ -3966,25 +4022,53 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3 ; VI-NEXT: ; implicit-def: $sgpr47 ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; implicit-def: $sgpr45 -; VI-NEXT: ; implicit-def: $sgpr44 -; VI-NEXT: ; implicit-def: $sgpr43 -; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 ; VI-NEXT: s_branch .LBB13_2 ; ; GFX9-LABEL: bitcast_v26i32_to_v52i16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, s16 +; GFX9-NEXT: v_mov_b32_e32 v14, s17 +; GFX9-NEXT: v_mov_b32_e32 v15, s18 +; GFX9-NEXT: v_mov_b32_e32 v16, s19 +; GFX9-NEXT: v_mov_b32_e32 v17, s20 +; GFX9-NEXT: v_mov_b32_e32 v18, s21 +; GFX9-NEXT: v_mov_b32_e32 v19, s22 +; GFX9-NEXT: v_readfirstlane_b32 s6, v13 +; GFX9-NEXT: v_mov_b32_e32 v13, s23 +; GFX9-NEXT: v_readfirstlane_b32 s7, v14 +; GFX9-NEXT: v_mov_b32_e32 v14, s24 +; GFX9-NEXT: v_readfirstlane_b32 s8, v15 +; GFX9-NEXT: v_mov_b32_e32 v15, s25 +; GFX9-NEXT: v_readfirstlane_b32 s9, v16 +; GFX9-NEXT: v_mov_b32_e32 v16, s26 +; GFX9-NEXT: v_readfirstlane_b32 s10, v17 +; GFX9-NEXT: v_mov_b32_e32 v17, s27 +; GFX9-NEXT: v_readfirstlane_b32 s11, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, s28 +; GFX9-NEXT: v_readfirstlane_b32 s12, v19 +; GFX9-NEXT: v_mov_b32_e32 v19, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: v_readfirstlane_b32 s7, v1 -; GFX9-NEXT: v_readfirstlane_b32 s8, v2 -; GFX9-NEXT: v_readfirstlane_b32 s9, v3 -; GFX9-NEXT: v_readfirstlane_b32 s10, v4 -; GFX9-NEXT: v_readfirstlane_b32 s11, v5 -; GFX9-NEXT: v_readfirstlane_b32 s12, v6 -; GFX9-NEXT: v_readfirstlane_b32 s13, v7 -; GFX9-NEXT: v_readfirstlane_b32 s14, v8 -; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s13, v13 +; GFX9-NEXT: v_readfirstlane_b32 s14, v14 +; GFX9-NEXT: v_readfirstlane_b32 s15, v15 +; GFX9-NEXT: v_readfirstlane_b32 s16, v16 +; GFX9-NEXT: v_readfirstlane_b32 s17, v17 +; GFX9-NEXT: v_readfirstlane_b32 s18, v18 +; GFX9-NEXT: v_readfirstlane_b32 s19, v19 +; GFX9-NEXT: v_readfirstlane_b32 s20, v0 +; GFX9-NEXT: v_readfirstlane_b32 s21, v1 +; GFX9-NEXT: v_readfirstlane_b32 s22, v2 +; GFX9-NEXT: v_readfirstlane_b32 s23, v3 +; GFX9-NEXT: v_readfirstlane_b32 s24, v4 +; GFX9-NEXT: v_readfirstlane_b32 s25, v5 +; GFX9-NEXT: v_readfirstlane_b32 s26, v6 +; GFX9-NEXT: v_readfirstlane_b32 s27, v7 +; GFX9-NEXT: v_readfirstlane_b32 s28, v8 +; GFX9-NEXT: v_readfirstlane_b32 s29, v9 ; GFX9-NEXT: v_readfirstlane_b32 s40, v10 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_readfirstlane_b32 s41, v11 @@ -3992,44 +4076,34 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_lshr_b32 s42, s41, 16 ; GFX9-NEXT: s_lshr_b32 s43, s40, 16 -; GFX9-NEXT: s_lshr_b32 s44, s15, 16 -; GFX9-NEXT: s_lshr_b32 s45, s14, 16 -; GFX9-NEXT: s_lshr_b32 s46, s13, 16 -; GFX9-NEXT: s_lshr_b32 s47, s12, 16 -; GFX9-NEXT: s_lshr_b32 s56, s11, 16 -; GFX9-NEXT: s_lshr_b32 s57, s10, 16 -; GFX9-NEXT: s_lshr_b32 s58, s9, 16 -; GFX9-NEXT: s_lshr_b32 s59, s8, 16 -; GFX9-NEXT: s_lshr_b32 s60, s7, 16 -; GFX9-NEXT: s_lshr_b32 s61, s6, 16 -; GFX9-NEXT: s_lshr_b32 s62, s29, 16 -; GFX9-NEXT: s_lshr_b32 s63, s28, 16 -; GFX9-NEXT: s_lshr_b32 s72, s27, 16 -; GFX9-NEXT: s_lshr_b32 s73, s26, 16 -; GFX9-NEXT: s_lshr_b32 s74, s25, 16 -; GFX9-NEXT: s_lshr_b32 s75, s24, 16 -; GFX9-NEXT: s_lshr_b32 s76, s23, 16 -; GFX9-NEXT: s_lshr_b32 s77, s22, 16 -; GFX9-NEXT: s_lshr_b32 s78, s21, 16 -; GFX9-NEXT: s_lshr_b32 s79, s20, 16 -; GFX9-NEXT: s_lshr_b32 s88, s19, 16 -; GFX9-NEXT: s_lshr_b32 s89, s18, 16 -; GFX9-NEXT: s_lshr_b32 s90, s17, 16 -; GFX9-NEXT: s_lshr_b32 s91, s16, 16 +; GFX9-NEXT: s_lshr_b32 s44, s29, 16 +; GFX9-NEXT: s_lshr_b32 s45, s28, 16 +; GFX9-NEXT: s_lshr_b32 s46, s27, 16 +; GFX9-NEXT: s_lshr_b32 s47, s26, 16 +; GFX9-NEXT: s_lshr_b32 s56, s25, 16 +; GFX9-NEXT: s_lshr_b32 s57, s24, 16 +; GFX9-NEXT: s_lshr_b32 s58, s23, 16 +; GFX9-NEXT: s_lshr_b32 s59, s22, 16 +; GFX9-NEXT: s_lshr_b32 s60, s21, 16 +; GFX9-NEXT: s_lshr_b32 s61, s20, 16 +; GFX9-NEXT: s_lshr_b32 s62, s19, 16 +; GFX9-NEXT: s_lshr_b32 s63, s18, 16 +; GFX9-NEXT: s_lshr_b32 s72, s17, 16 +; GFX9-NEXT: s_lshr_b32 s73, s16, 16 +; GFX9-NEXT: s_lshr_b32 s74, s15, 16 +; GFX9-NEXT: s_lshr_b32 s75, s14, 16 +; GFX9-NEXT: s_lshr_b32 s76, s13, 16 +; GFX9-NEXT: s_lshr_b32 s77, s12, 16 +; GFX9-NEXT: s_lshr_b32 s78, s11, 16 +; GFX9-NEXT: s_lshr_b32 s79, s10, 16 +; GFX9-NEXT: s_lshr_b32 s88, s9, 16 +; GFX9-NEXT: s_lshr_b32 s89, s8, 16 +; GFX9-NEXT: s_lshr_b32 s90, s7, 16 +; GFX9-NEXT: s_lshr_b32 s91, s6, 16 ; GFX9-NEXT: s_cbranch_execnz .LBB13_3 ; GFX9-NEXT: .LBB13_2: ; %cmp.true ; GFX9-NEXT: s_add_i32 s41, s41, 3 ; GFX9-NEXT: s_add_i32 s40, s40, 3 -; GFX9-NEXT: s_add_i32 s15, s15, 3 -; GFX9-NEXT: s_add_i32 s14, s14, 3 -; GFX9-NEXT: s_add_i32 s13, s13, 3 -; GFX9-NEXT: s_add_i32 s12, s12, 3 -; GFX9-NEXT: s_add_i32 s11, s11, 3 -; GFX9-NEXT: s_add_i32 s10, s10, 3 -; GFX9-NEXT: s_add_i32 s9, s9, 3 -; GFX9-NEXT: s_add_i32 s8, s8, 3 -; GFX9-NEXT: s_add_i32 s7, s7, 3 -; GFX9-NEXT: s_add_i32 s6, s6, 3 ; GFX9-NEXT: s_add_i32 s29, s29, 3 ; GFX9-NEXT: s_add_i32 s28, s28, 3 ; GFX9-NEXT: s_add_i32 s27, s27, 3 @@ -4044,83 +4118,93 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3 ; GFX9-NEXT: s_add_i32 s18, s18, 3 ; GFX9-NEXT: s_add_i32 s17, s17, 3 ; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s15, s15, 3 +; GFX9-NEXT: s_add_i32 s14, s14, 3 +; GFX9-NEXT: s_add_i32 s13, s13, 3 +; GFX9-NEXT: s_add_i32 s12, s12, 3 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_add_i32 s10, s10, 3 +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: s_add_i32 s6, s6, 3 ; GFX9-NEXT: s_lshr_b32 s42, s41, 16 ; GFX9-NEXT: s_lshr_b32 s43, s40, 16 -; GFX9-NEXT: s_lshr_b32 s44, s15, 16 -; GFX9-NEXT: s_lshr_b32 s45, s14, 16 -; GFX9-NEXT: s_lshr_b32 s46, s13, 16 -; GFX9-NEXT: s_lshr_b32 s47, s12, 16 -; GFX9-NEXT: s_lshr_b32 s56, s11, 16 -; GFX9-NEXT: s_lshr_b32 s57, s10, 16 -; GFX9-NEXT: s_lshr_b32 s58, s9, 16 -; GFX9-NEXT: s_lshr_b32 s59, s8, 16 -; GFX9-NEXT: s_lshr_b32 s60, s7, 16 -; GFX9-NEXT: s_lshr_b32 s61, s6, 16 -; GFX9-NEXT: s_lshr_b32 s62, s29, 16 -; GFX9-NEXT: s_lshr_b32 s63, s28, 16 -; GFX9-NEXT: s_lshr_b32 s72, s27, 16 -; GFX9-NEXT: s_lshr_b32 s73, s26, 16 -; GFX9-NEXT: s_lshr_b32 s74, s25, 16 -; GFX9-NEXT: s_lshr_b32 s75, s24, 16 -; GFX9-NEXT: s_lshr_b32 s76, s23, 16 -; GFX9-NEXT: s_lshr_b32 s77, s22, 16 -; GFX9-NEXT: s_lshr_b32 s78, s21, 16 -; GFX9-NEXT: s_lshr_b32 s79, s20, 16 -; GFX9-NEXT: s_lshr_b32 s88, s19, 16 -; GFX9-NEXT: s_lshr_b32 s89, s18, 16 -; GFX9-NEXT: s_lshr_b32 s90, s17, 16 -; GFX9-NEXT: s_lshr_b32 s91, s16, 16 +; GFX9-NEXT: s_lshr_b32 s44, s29, 16 +; GFX9-NEXT: s_lshr_b32 s45, s28, 16 +; GFX9-NEXT: s_lshr_b32 s46, s27, 16 +; GFX9-NEXT: s_lshr_b32 s47, s26, 16 +; GFX9-NEXT: s_lshr_b32 s56, s25, 16 +; GFX9-NEXT: s_lshr_b32 s57, s24, 16 +; GFX9-NEXT: s_lshr_b32 s58, s23, 16 +; GFX9-NEXT: s_lshr_b32 s59, s22, 16 +; GFX9-NEXT: s_lshr_b32 s60, s21, 16 +; GFX9-NEXT: s_lshr_b32 s61, s20, 16 +; GFX9-NEXT: s_lshr_b32 s62, s19, 16 +; GFX9-NEXT: s_lshr_b32 s63, s18, 16 +; GFX9-NEXT: s_lshr_b32 s72, s17, 16 +; GFX9-NEXT: s_lshr_b32 s73, s16, 16 +; GFX9-NEXT: s_lshr_b32 s74, s15, 16 +; GFX9-NEXT: s_lshr_b32 s75, s14, 16 +; GFX9-NEXT: s_lshr_b32 s76, s13, 16 +; GFX9-NEXT: s_lshr_b32 s77, s12, 16 +; GFX9-NEXT: s_lshr_b32 s78, s11, 16 +; GFX9-NEXT: s_lshr_b32 s79, s10, 16 +; GFX9-NEXT: s_lshr_b32 s88, s9, 16 +; GFX9-NEXT: s_lshr_b32 s89, s8, 16 +; GFX9-NEXT: s_lshr_b32 s90, s7, 16 +; GFX9-NEXT: s_lshr_b32 s91, s6, 16 ; GFX9-NEXT: .LBB13_3: ; %end -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s91 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s90 -; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s89 -; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s88 -; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s79 -; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s78 -; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s77 -; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s76 -; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s75 -; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s74 -; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s73 -; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s72 -; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s63 -; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s62 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s61 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s60 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s59 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s58 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s57 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s56 -; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s47 -; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s46 -; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s45 -; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s91 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s7, s90 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s89 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s9, s88 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s10, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s11, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s12, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s13, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s14, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s15, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s16, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s17, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s44 ; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s43 ; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s42 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s18 -; GFX9-NEXT: v_mov_b32_e32 v5, s19 -; GFX9-NEXT: v_mov_b32_e32 v6, s20 -; GFX9-NEXT: v_mov_b32_e32 v7, s21 -; GFX9-NEXT: v_mov_b32_e32 v8, s22 -; GFX9-NEXT: v_mov_b32_e32 v9, s23 -; GFX9-NEXT: v_mov_b32_e32 v10, s24 -; GFX9-NEXT: v_mov_b32_e32 v11, s25 -; GFX9-NEXT: v_mov_b32_e32 v12, s26 -; GFX9-NEXT: v_mov_b32_e32 v13, s27 -; GFX9-NEXT: v_mov_b32_e32 v14, s6 -; GFX9-NEXT: v_mov_b32_e32 v15, s7 -; GFX9-NEXT: v_mov_b32_e32 v16, s8 -; GFX9-NEXT: v_mov_b32_e32 v17, s9 -; GFX9-NEXT: v_mov_b32_e32 v18, s10 -; GFX9-NEXT: v_mov_b32_e32 v19, s11 -; GFX9-NEXT: v_mov_b32_e32 v20, s12 -; GFX9-NEXT: v_mov_b32_e32 v21, s13 -; GFX9-NEXT: v_mov_b32_e32 v22, s14 -; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-NEXT: v_mov_b32_e32 v12, s16 +; GFX9-NEXT: v_mov_b32_e32 v13, s17 +; GFX9-NEXT: v_mov_b32_e32 v14, s18 +; GFX9-NEXT: v_mov_b32_e32 v15, s19 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v18, s22 +; GFX9-NEXT: v_mov_b32_e32 v19, s23 +; GFX9-NEXT: v_mov_b32_e32 v20, s24 +; GFX9-NEXT: v_mov_b32_e32 v21, s25 +; GFX9-NEXT: v_mov_b32_e32 v22, s26 +; GFX9-NEXT: v_mov_b32_e32 v23, s27 ; GFX9-NEXT: v_mov_b32_e32 v24, s28 ; GFX9-NEXT: v_mov_b32_e32 v25, s29 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4156,41 +4240,68 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3 ; GFX11-LABEL: bitcast_v26i32_to_v52i16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 +; GFX11-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 +; GFX11-NEXT: v_dual_mov_b32 v13, s16 :: v_dual_mov_b32 v14, s17 +; GFX11-NEXT: v_dual_mov_b32 v15, s18 :: v_dual_mov_b32 v16, s19 +; GFX11-NEXT: v_dual_mov_b32 v17, s20 :: v_dual_mov_b32 v18, s21 +; GFX11-NEXT: v_dual_mov_b32 v19, s22 :: v_dual_mov_b32 v20, s23 +; GFX11-NEXT: v_dual_mov_b32 v21, s24 :: v_dual_mov_b32 v22, s25 +; GFX11-NEXT: v_dual_mov_b32 v23, s26 :: v_dual_mov_b32 v24, s27 +; GFX11-NEXT: v_dual_mov_b32 v25, s28 :: v_dual_mov_b32 v26, s29 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: v_readfirstlane_b32 s8, v4 -; GFX11-NEXT: v_readfirstlane_b32 s9, v5 -; GFX11-NEXT: v_readfirstlane_b32 s11, v6 -; GFX11-NEXT: v_readfirstlane_b32 s10, v7 +; GFX11-NEXT: v_readfirstlane_b32 s0, v9 +; GFX11-NEXT: v_readfirstlane_b32 s1, v10 +; GFX11-NEXT: v_readfirstlane_b32 s2, v11 +; GFX11-NEXT: v_readfirstlane_b32 s3, v12 +; GFX11-NEXT: v_readfirstlane_b32 s4, v13 +; GFX11-NEXT: v_readfirstlane_b32 s5, v14 +; GFX11-NEXT: v_readfirstlane_b32 s6, v15 +; GFX11-NEXT: v_readfirstlane_b32 s7, v16 +; GFX11-NEXT: v_readfirstlane_b32 s8, v17 +; GFX11-NEXT: v_readfirstlane_b32 s9, v18 +; GFX11-NEXT: v_readfirstlane_b32 s10, v19 +; GFX11-NEXT: v_readfirstlane_b32 s11, v20 +; GFX11-NEXT: v_readfirstlane_b32 s12, v21 +; GFX11-NEXT: v_readfirstlane_b32 s13, v22 +; GFX11-NEXT: v_readfirstlane_b32 s14, v23 +; GFX11-NEXT: v_readfirstlane_b32 s15, v24 +; GFX11-NEXT: v_readfirstlane_b32 s16, v25 +; GFX11-NEXT: v_readfirstlane_b32 s17, v26 +; GFX11-NEXT: v_readfirstlane_b32 s18, v0 +; GFX11-NEXT: v_readfirstlane_b32 s19, v1 +; GFX11-NEXT: v_readfirstlane_b32 s20, v2 +; GFX11-NEXT: v_readfirstlane_b32 s21, v3 +; GFX11-NEXT: v_readfirstlane_b32 s22, v4 +; GFX11-NEXT: v_readfirstlane_b32 s23, v5 +; GFX11-NEXT: v_readfirstlane_b32 s25, v6 +; GFX11-NEXT: v_readfirstlane_b32 s24, v7 ; GFX11-NEXT: s_mov_b32 s78, 0 -; GFX11-NEXT: s_and_b32 s12, vcc_lo, exec_lo +; GFX11-NEXT: s_and_b32 s26, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB13_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: s_lshr_b32 s12, s10, 16 -; GFX11-NEXT: s_lshr_b32 s13, s11, 16 -; GFX11-NEXT: s_lshr_b32 s14, s9, 16 -; GFX11-NEXT: s_lshr_b32 s15, s8, 16 -; GFX11-NEXT: s_lshr_b32 s40, s7, 16 -; GFX11-NEXT: s_lshr_b32 s41, s6, 16 -; GFX11-NEXT: s_lshr_b32 s42, s5, 16 -; GFX11-NEXT: s_lshr_b32 s43, s4, 16 -; GFX11-NEXT: s_lshr_b32 s44, s29, 16 -; GFX11-NEXT: s_lshr_b32 s45, s28, 16 -; GFX11-NEXT: s_lshr_b32 s46, s27, 16 -; GFX11-NEXT: s_lshr_b32 s47, s26, 16 -; GFX11-NEXT: s_lshr_b32 s56, s25, 16 -; GFX11-NEXT: s_lshr_b32 s57, s24, 16 -; GFX11-NEXT: s_lshr_b32 s58, s23, 16 -; GFX11-NEXT: s_lshr_b32 s59, s22, 16 -; GFX11-NEXT: s_lshr_b32 s60, s21, 16 -; GFX11-NEXT: s_lshr_b32 s61, s20, 16 -; GFX11-NEXT: s_lshr_b32 s62, s19, 16 -; GFX11-NEXT: s_lshr_b32 s63, s18, 16 -; GFX11-NEXT: s_lshr_b32 s72, s17, 16 -; GFX11-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-NEXT: s_lshr_b32 s26, s24, 16 +; GFX11-NEXT: s_lshr_b32 s27, s25, 16 +; GFX11-NEXT: s_lshr_b32 s28, s23, 16 +; GFX11-NEXT: s_lshr_b32 s29, s22, 16 +; GFX11-NEXT: s_lshr_b32 s40, s21, 16 +; GFX11-NEXT: s_lshr_b32 s41, s20, 16 +; GFX11-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-NEXT: s_lshr_b32 s44, s17, 16 +; GFX11-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-NEXT: s_lshr_b32 s46, s15, 16 +; GFX11-NEXT: s_lshr_b32 s47, s14, 16 +; GFX11-NEXT: s_lshr_b32 s56, s13, 16 +; GFX11-NEXT: s_lshr_b32 s57, s12, 16 +; GFX11-NEXT: s_lshr_b32 s58, s11, 16 +; GFX11-NEXT: s_lshr_b32 s59, s10, 16 +; GFX11-NEXT: s_lshr_b32 s60, s9, 16 +; GFX11-NEXT: s_lshr_b32 s61, s8, 16 +; GFX11-NEXT: s_lshr_b32 s62, s7, 16 +; GFX11-NEXT: s_lshr_b32 s63, s6, 16 +; GFX11-NEXT: s_lshr_b32 s72, s5, 16 +; GFX11-NEXT: s_lshr_b32 s73, s4, 16 ; GFX11-NEXT: s_lshr_b32 s74, s3, 16 ; GFX11-NEXT: s_lshr_b32 s75, s2, 16 ; GFX11-NEXT: s_lshr_b32 s76, s1, 16 @@ -4198,20 +4309,8 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s78 ; GFX11-NEXT: s_cbranch_vccnz .LBB13_3 ; GFX11-NEXT: .LBB13_2: ; %cmp.true -; GFX11-NEXT: s_add_i32 s10, s10, 3 -; GFX11-NEXT: s_add_i32 s11, s11, 3 -; GFX11-NEXT: s_add_i32 s9, s9, 3 -; GFX11-NEXT: s_add_i32 s8, s8, 3 -; GFX11-NEXT: s_add_i32 s7, s7, 3 -; GFX11-NEXT: s_add_i32 s6, s6, 3 -; GFX11-NEXT: s_add_i32 s5, s5, 3 -; GFX11-NEXT: s_add_i32 s4, s4, 3 -; GFX11-NEXT: s_add_i32 s29, s29, 3 -; GFX11-NEXT: s_add_i32 s28, s28, 3 -; GFX11-NEXT: s_add_i32 s27, s27, 3 -; GFX11-NEXT: s_add_i32 s26, s26, 3 -; GFX11-NEXT: s_add_i32 s25, s25, 3 ; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s25, s25, 3 ; GFX11-NEXT: s_add_i32 s23, s23, 3 ; GFX11-NEXT: s_add_i32 s22, s22, 3 ; GFX11-NEXT: s_add_i32 s21, s21, 3 @@ -4220,32 +4319,44 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3 ; GFX11-NEXT: s_add_i32 s18, s18, 3 ; GFX11-NEXT: s_add_i32 s17, s17, 3 ; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s15, s15, 3 +; GFX11-NEXT: s_add_i32 s14, s14, 3 +; GFX11-NEXT: s_add_i32 s13, s13, 3 +; GFX11-NEXT: s_add_i32 s12, s12, 3 +; GFX11-NEXT: s_add_i32 s11, s11, 3 +; GFX11-NEXT: s_add_i32 s10, s10, 3 +; GFX11-NEXT: s_add_i32 s9, s9, 3 +; GFX11-NEXT: s_add_i32 s8, s8, 3 +; GFX11-NEXT: s_add_i32 s7, s7, 3 +; GFX11-NEXT: s_add_i32 s6, s6, 3 +; GFX11-NEXT: s_add_i32 s5, s5, 3 +; GFX11-NEXT: s_add_i32 s4, s4, 3 ; GFX11-NEXT: s_add_i32 s3, s3, 3 ; GFX11-NEXT: s_add_i32 s2, s2, 3 ; GFX11-NEXT: s_add_i32 s1, s1, 3 ; GFX11-NEXT: s_add_i32 s0, s0, 3 -; GFX11-NEXT: s_lshr_b32 s12, s10, 16 -; GFX11-NEXT: s_lshr_b32 s13, s11, 16 -; GFX11-NEXT: s_lshr_b32 s14, s9, 16 -; GFX11-NEXT: s_lshr_b32 s15, s8, 16 -; GFX11-NEXT: s_lshr_b32 s40, s7, 16 -; GFX11-NEXT: s_lshr_b32 s41, s6, 16 -; GFX11-NEXT: s_lshr_b32 s42, s5, 16 -; GFX11-NEXT: s_lshr_b32 s43, s4, 16 -; GFX11-NEXT: s_lshr_b32 s44, s29, 16 -; GFX11-NEXT: s_lshr_b32 s45, s28, 16 -; GFX11-NEXT: s_lshr_b32 s46, s27, 16 -; GFX11-NEXT: s_lshr_b32 s47, s26, 16 -; GFX11-NEXT: s_lshr_b32 s56, s25, 16 -; GFX11-NEXT: s_lshr_b32 s57, s24, 16 -; GFX11-NEXT: s_lshr_b32 s58, s23, 16 -; GFX11-NEXT: s_lshr_b32 s59, s22, 16 -; GFX11-NEXT: s_lshr_b32 s60, s21, 16 -; GFX11-NEXT: s_lshr_b32 s61, s20, 16 -; GFX11-NEXT: s_lshr_b32 s62, s19, 16 -; GFX11-NEXT: s_lshr_b32 s63, s18, 16 -; GFX11-NEXT: s_lshr_b32 s72, s17, 16 -; GFX11-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-NEXT: s_lshr_b32 s26, s24, 16 +; GFX11-NEXT: s_lshr_b32 s27, s25, 16 +; GFX11-NEXT: s_lshr_b32 s28, s23, 16 +; GFX11-NEXT: s_lshr_b32 s29, s22, 16 +; GFX11-NEXT: s_lshr_b32 s40, s21, 16 +; GFX11-NEXT: s_lshr_b32 s41, s20, 16 +; GFX11-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-NEXT: s_lshr_b32 s44, s17, 16 +; GFX11-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-NEXT: s_lshr_b32 s46, s15, 16 +; GFX11-NEXT: s_lshr_b32 s47, s14, 16 +; GFX11-NEXT: s_lshr_b32 s56, s13, 16 +; GFX11-NEXT: s_lshr_b32 s57, s12, 16 +; GFX11-NEXT: s_lshr_b32 s58, s11, 16 +; GFX11-NEXT: s_lshr_b32 s59, s10, 16 +; GFX11-NEXT: s_lshr_b32 s60, s9, 16 +; GFX11-NEXT: s_lshr_b32 s61, s8, 16 +; GFX11-NEXT: s_lshr_b32 s62, s7, 16 +; GFX11-NEXT: s_lshr_b32 s63, s6, 16 +; GFX11-NEXT: s_lshr_b32 s72, s5, 16 +; GFX11-NEXT: s_lshr_b32 s73, s4, 16 ; GFX11-NEXT: s_lshr_b32 s74, s3, 16 ; GFX11-NEXT: s_lshr_b32 s75, s2, 16 ; GFX11-NEXT: s_lshr_b32 s76, s1, 16 @@ -4256,41 +4367,41 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3 ; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s76 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s75 ; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s74 -; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s73 -; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s72 -; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s63 -; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s62 -; GFX11-NEXT: s_pack_ll_b32_b16 s20, s20, s61 -; GFX11-NEXT: s_pack_ll_b32_b16 s21, s21, s60 -; GFX11-NEXT: s_pack_ll_b32_b16 s22, s22, s59 -; GFX11-NEXT: s_pack_ll_b32_b16 s23, s23, s58 -; GFX11-NEXT: s_pack_ll_b32_b16 s24, s24, s57 -; GFX11-NEXT: s_pack_ll_b32_b16 s25, s25, s56 -; GFX11-NEXT: s_pack_ll_b32_b16 s26, s26, s47 -; GFX11-NEXT: s_pack_ll_b32_b16 s27, s27, s46 -; GFX11-NEXT: s_pack_ll_b32_b16 s28, s28, s45 -; GFX11-NEXT: s_pack_ll_b32_b16 s29, s29, s44 -; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s43 -; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s42 -; GFX11-NEXT: s_pack_ll_b32_b16 s6, s6, s41 -; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s40 -; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s15 -; GFX11-NEXT: s_pack_ll_b32_b16 s9, s9, s14 -; GFX11-NEXT: s_pack_ll_b32_b16 s11, s11, s13 -; GFX11-NEXT: s_pack_ll_b32_b16 s10, s10, s12 +; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s73 +; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s72 +; GFX11-NEXT: s_pack_ll_b32_b16 s6, s6, s63 +; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s62 +; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s61 +; GFX11-NEXT: s_pack_ll_b32_b16 s9, s9, s60 +; GFX11-NEXT: s_pack_ll_b32_b16 s10, s10, s59 +; GFX11-NEXT: s_pack_ll_b32_b16 s11, s11, s58 +; GFX11-NEXT: s_pack_ll_b32_b16 s12, s12, s57 +; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s56 +; GFX11-NEXT: s_pack_ll_b32_b16 s14, s14, s47 +; GFX11-NEXT: s_pack_ll_b32_b16 s15, s15, s46 +; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s45 +; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s44 +; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s43 +; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s42 +; GFX11-NEXT: s_pack_ll_b32_b16 s20, s20, s41 +; GFX11-NEXT: s_pack_ll_b32_b16 s21, s21, s40 +; GFX11-NEXT: s_pack_ll_b32_b16 s22, s22, s29 +; GFX11-NEXT: s_pack_ll_b32_b16 s23, s23, s28 +; GFX11-NEXT: s_pack_ll_b32_b16 s25, s25, s27 +; GFX11-NEXT: s_pack_ll_b32_b16 s24, s24, s26 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 -; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 -; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 -; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 -; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 -; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 -; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 -; GFX11-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 -; GFX11-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 -; GFX11-NEXT: v_dual_mov_b32 v22, s8 :: v_dual_mov_b32 v23, s9 -; GFX11-NEXT: v_dual_mov_b32 v24, s11 :: v_dual_mov_b32 v25, s10 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GFX11-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v21, s21 +; GFX11-NEXT: v_dual_mov_b32 v22, s22 :: v_dual_mov_b32 v23, s23 +; GFX11-NEXT: v_dual_mov_b32 v24, s25 :: v_dual_mov_b32 v25, s24 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB13_4: ; GFX11-NEXT: ; implicit-def: $sgpr77 @@ -4315,10 +4426,10 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr41 ; GFX11-NEXT: ; implicit-def: $sgpr40 -; GFX11-NEXT: ; implicit-def: $sgpr15 -; GFX11-NEXT: ; implicit-def: $sgpr14 -; GFX11-NEXT: ; implicit-def: $sgpr13 -; GFX11-NEXT: ; implicit-def: $sgpr12 +; GFX11-NEXT: ; implicit-def: $sgpr29 +; GFX11-NEXT: ; implicit-def: $sgpr28 +; GFX11-NEXT: ; implicit-def: $sgpr27 +; GFX11-NEXT: ; implicit-def: $sgpr26 ; GFX11-NEXT: s_branch .LBB13_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -4761,7 +4872,7 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 ; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 ; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v24 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v25 ; SI-NEXT: .LBB14_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] @@ -7827,9 +7938,37 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i ; SI-LABEL: bitcast_v26i32_to_v52f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, s16 +; SI-NEXT: v_mov_b32_e32 v15, s17 +; SI-NEXT: v_mov_b32_e32 v16, s18 +; SI-NEXT: v_mov_b32_e32 v17, s19 +; SI-NEXT: v_mov_b32_e32 v18, s20 +; SI-NEXT: v_mov_b32_e32 v19, s21 +; SI-NEXT: v_readfirstlane_b32 s40, v14 +; SI-NEXT: v_mov_b32_e32 v14, s22 +; SI-NEXT: v_readfirstlane_b32 s41, v15 +; SI-NEXT: v_mov_b32_e32 v15, s23 +; SI-NEXT: v_readfirstlane_b32 s23, v16 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_readfirstlane_b32 s24, v17 +; SI-NEXT: v_mov_b32_e32 v17, s25 +; SI-NEXT: v_readfirstlane_b32 s25, v18 +; SI-NEXT: v_mov_b32_e32 v18, s26 +; SI-NEXT: v_readfirstlane_b32 s26, v19 +; SI-NEXT: v_mov_b32_e32 v19, s27 +; SI-NEXT: v_readfirstlane_b32 s27, v14 +; SI-NEXT: v_mov_b32_e32 v14, s28 +; SI-NEXT: v_readfirstlane_b32 s28, v15 +; SI-NEXT: v_mov_b32_e32 v15, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; SI-NEXT: v_readfirstlane_b32 s41, v1 -; SI-NEXT: v_readfirstlane_b32 s40, v2 +; SI-NEXT: v_readfirstlane_b32 s29, v16 +; SI-NEXT: v_readfirstlane_b32 s22, v17 +; SI-NEXT: v_readfirstlane_b32 s21, v18 +; SI-NEXT: v_readfirstlane_b32 s20, v19 +; SI-NEXT: v_readfirstlane_b32 s19, v14 +; SI-NEXT: v_readfirstlane_b32 s18, v15 +; SI-NEXT: v_readfirstlane_b32 s17, v1 +; SI-NEXT: v_readfirstlane_b32 s16, v2 ; SI-NEXT: v_readfirstlane_b32 s15, v3 ; SI-NEXT: v_readfirstlane_b32 s14, v4 ; SI-NEXT: v_readfirstlane_b32 s13, v5 @@ -7868,39 +8007,39 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 ; SI-NEXT: s_lshr_b32 s4, s15, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s40, 16 +; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s41, 16 +; SI-NEXT: s_lshr_b32 s4, s17, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s4, s18, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: s_lshr_b32 s4, s19, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: s_lshr_b32 s4, s20, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: s_lshr_b32 s4, s21, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: s_lshr_b32 s4, s22, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: s_lshr_b32 s4, s29, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: s_lshr_b32 s4, s28, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: s_lshr_b32 s4, s27, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: s_lshr_b32 s4, s26, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: s_lshr_b32 s4, s25, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: s_lshr_b32 s4, s24, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: s_lshr_b32 s4, s23, 16 ; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: s_lshr_b32 s4, s41, 16 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_lshr_b32 s4, s40, 16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 @@ -7913,31 +8052,26 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s40 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 @@ -7945,8 +8079,13 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i ; SI-NEXT: s_add_i32 s27, s27, 3 ; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s15, s15, 3 ; SI-NEXT: s_add_i32 s14, s14, 3 ; SI-NEXT: s_add_i32 s13, s13, 3 @@ -7957,22 +8096,22 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i ; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: s_lshr_b32 s5, s17, 16 -; SI-NEXT: s_lshr_b32 s42, s18, 16 -; SI-NEXT: s_lshr_b32 s43, s19, 16 -; SI-NEXT: s_lshr_b32 s44, s20, 16 -; SI-NEXT: s_lshr_b32 s45, s21, 16 -; SI-NEXT: s_lshr_b32 s46, s22, 16 -; SI-NEXT: s_lshr_b32 s47, s23, 16 -; SI-NEXT: s_lshr_b32 s56, s24, 16 -; SI-NEXT: s_lshr_b32 s57, s25, 16 -; SI-NEXT: s_lshr_b32 s58, s26, 16 -; SI-NEXT: s_lshr_b32 s59, s27, 16 -; SI-NEXT: s_lshr_b32 s60, s28, 16 -; SI-NEXT: s_lshr_b32 s61, s29, 16 -; SI-NEXT: s_lshr_b32 s62, s41, 16 -; SI-NEXT: s_lshr_b32 s63, s40, 16 +; SI-NEXT: s_lshr_b32 s4, s40, 16 +; SI-NEXT: s_lshr_b32 s5, s41, 16 +; SI-NEXT: s_lshr_b32 s42, s23, 16 +; SI-NEXT: s_lshr_b32 s43, s24, 16 +; SI-NEXT: s_lshr_b32 s44, s25, 16 +; SI-NEXT: s_lshr_b32 s45, s26, 16 +; SI-NEXT: s_lshr_b32 s46, s27, 16 +; SI-NEXT: s_lshr_b32 s47, s28, 16 +; SI-NEXT: s_lshr_b32 s56, s29, 16 +; SI-NEXT: s_lshr_b32 s57, s22, 16 +; SI-NEXT: s_lshr_b32 s58, s21, 16 +; SI-NEXT: s_lshr_b32 s59, s20, 16 +; SI-NEXT: s_lshr_b32 s60, s19, 16 +; SI-NEXT: s_lshr_b32 s61, s18, 16 +; SI-NEXT: s_lshr_b32 s62, s17, 16 +; SI-NEXT: s_lshr_b32 s63, s16, 16 ; SI-NEXT: s_lshr_b32 s72, s15, 16 ; SI-NEXT: s_lshr_b32 s73, s14, 16 ; SI-NEXT: s_lshr_b32 s74, s13, 16 @@ -7993,24 +8132,24 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s23 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v41, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s41 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v43, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s40 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s89 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s88 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s79 @@ -8285,9 +8424,37 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i ; VI-LABEL: bitcast_v26i32_to_v52f16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, s16 +; VI-NEXT: v_mov_b32_e32 v14, s17 +; VI-NEXT: v_mov_b32_e32 v15, s18 +; VI-NEXT: v_mov_b32_e32 v16, s19 +; VI-NEXT: v_mov_b32_e32 v17, s20 +; VI-NEXT: v_mov_b32_e32 v18, s21 +; VI-NEXT: v_mov_b32_e32 v19, s22 +; VI-NEXT: v_readfirstlane_b32 s44, v13 +; VI-NEXT: v_mov_b32_e32 v13, s23 +; VI-NEXT: v_readfirstlane_b32 s43, v14 +; VI-NEXT: v_mov_b32_e32 v14, s24 +; VI-NEXT: v_readfirstlane_b32 s42, v15 +; VI-NEXT: v_mov_b32_e32 v15, s25 +; VI-NEXT: v_readfirstlane_b32 s41, v16 +; VI-NEXT: v_mov_b32_e32 v16, s26 +; VI-NEXT: v_readfirstlane_b32 s40, v17 +; VI-NEXT: v_mov_b32_e32 v17, s27 +; VI-NEXT: v_readfirstlane_b32 s26, v18 +; VI-NEXT: v_mov_b32_e32 v18, s28 +; VI-NEXT: v_readfirstlane_b32 s25, v19 +; VI-NEXT: v_mov_b32_e32 v19, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; VI-NEXT: v_readfirstlane_b32 s41, v0 -; VI-NEXT: v_readfirstlane_b32 s40, v1 +; VI-NEXT: v_readfirstlane_b32 s24, v13 +; VI-NEXT: v_readfirstlane_b32 s23, v14 +; VI-NEXT: v_readfirstlane_b32 s22, v15 +; VI-NEXT: v_readfirstlane_b32 s21, v16 +; VI-NEXT: v_readfirstlane_b32 s20, v17 +; VI-NEXT: v_readfirstlane_b32 s19, v18 +; VI-NEXT: v_readfirstlane_b32 s18, v19 +; VI-NEXT: v_readfirstlane_b32 s17, v0 +; VI-NEXT: v_readfirstlane_b32 s16, v1 ; VI-NEXT: v_readfirstlane_b32 s15, v2 ; VI-NEXT: v_readfirstlane_b32 s14, v3 ; VI-NEXT: v_readfirstlane_b32 s13, v4 @@ -8301,9 +8468,9 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i ; VI-NEXT: v_readfirstlane_b32 s7, v11 ; VI-NEXT: s_cbranch_scc0 .LBB17_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s42, s7, 16 -; VI-NEXT: s_lshr_b32 s43, s6, 16 -; VI-NEXT: s_lshr_b32 s44, s8, 16 +; VI-NEXT: s_lshr_b32 s27, s7, 16 +; VI-NEXT: s_lshr_b32 s28, s6, 16 +; VI-NEXT: s_lshr_b32 s29, s8, 16 ; VI-NEXT: s_lshr_b32 s45, s9, 16 ; VI-NEXT: s_lshr_b32 s46, s10, 16 ; VI-NEXT: s_lshr_b32 s47, s11, 16 @@ -8311,22 +8478,22 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i ; VI-NEXT: s_lshr_b32 s57, s13, 16 ; VI-NEXT: s_lshr_b32 s58, s14, 16 ; VI-NEXT: s_lshr_b32 s59, s15, 16 -; VI-NEXT: s_lshr_b32 s60, s40, 16 -; VI-NEXT: s_lshr_b32 s61, s41, 16 -; VI-NEXT: s_lshr_b32 s62, s29, 16 -; VI-NEXT: s_lshr_b32 s63, s28, 16 -; VI-NEXT: s_lshr_b32 s72, s27, 16 -; VI-NEXT: s_lshr_b32 s73, s26, 16 -; VI-NEXT: s_lshr_b32 s74, s25, 16 -; VI-NEXT: s_lshr_b32 s75, s24, 16 -; VI-NEXT: s_lshr_b32 s76, s23, 16 -; VI-NEXT: s_lshr_b32 s77, s22, 16 -; VI-NEXT: s_lshr_b32 s78, s21, 16 -; VI-NEXT: s_lshr_b32 s79, s20, 16 -; VI-NEXT: s_lshr_b32 s88, s19, 16 -; VI-NEXT: s_lshr_b32 s89, s18, 16 -; VI-NEXT: s_lshr_b32 s90, s17, 16 -; VI-NEXT: s_lshr_b32 s91, s16, 16 +; VI-NEXT: s_lshr_b32 s60, s16, 16 +; VI-NEXT: s_lshr_b32 s61, s17, 16 +; VI-NEXT: s_lshr_b32 s62, s18, 16 +; VI-NEXT: s_lshr_b32 s63, s19, 16 +; VI-NEXT: s_lshr_b32 s72, s20, 16 +; VI-NEXT: s_lshr_b32 s73, s21, 16 +; VI-NEXT: s_lshr_b32 s74, s22, 16 +; VI-NEXT: s_lshr_b32 s75, s23, 16 +; VI-NEXT: s_lshr_b32 s76, s24, 16 +; VI-NEXT: s_lshr_b32 s77, s25, 16 +; VI-NEXT: s_lshr_b32 s78, s26, 16 +; VI-NEXT: s_lshr_b32 s79, s40, 16 +; VI-NEXT: s_lshr_b32 s88, s41, 16 +; VI-NEXT: s_lshr_b32 s89, s42, 16 +; VI-NEXT: s_lshr_b32 s90, s43, 16 +; VI-NEXT: s_lshr_b32 s91, s44, 16 ; VI-NEXT: s_cbranch_execnz .LBB17_3 ; VI-NEXT: .LBB17_2: ; %cmp.true ; VI-NEXT: s_add_i32 s7, s7, 3 @@ -8339,25 +8506,25 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i ; VI-NEXT: s_add_i32 s13, s13, 3 ; VI-NEXT: s_add_i32 s14, s14, 3 ; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 ; VI-NEXT: s_add_i32 s40, s40, 3 ; VI-NEXT: s_add_i32 s41, s41, 3 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_lshr_b32 s42, s7, 16 -; VI-NEXT: s_lshr_b32 s43, s6, 16 -; VI-NEXT: s_lshr_b32 s44, s8, 16 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_add_i32 s44, s44, 3 +; VI-NEXT: s_lshr_b32 s27, s7, 16 +; VI-NEXT: s_lshr_b32 s28, s6, 16 +; VI-NEXT: s_lshr_b32 s29, s8, 16 ; VI-NEXT: s_lshr_b32 s45, s9, 16 ; VI-NEXT: s_lshr_b32 s46, s10, 16 ; VI-NEXT: s_lshr_b32 s47, s11, 16 @@ -8365,117 +8532,117 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i ; VI-NEXT: s_lshr_b32 s57, s13, 16 ; VI-NEXT: s_lshr_b32 s58, s14, 16 ; VI-NEXT: s_lshr_b32 s59, s15, 16 -; VI-NEXT: s_lshr_b32 s60, s40, 16 -; VI-NEXT: s_lshr_b32 s61, s41, 16 -; VI-NEXT: s_lshr_b32 s62, s29, 16 -; VI-NEXT: s_lshr_b32 s63, s28, 16 -; VI-NEXT: s_lshr_b32 s72, s27, 16 -; VI-NEXT: s_lshr_b32 s73, s26, 16 -; VI-NEXT: s_lshr_b32 s74, s25, 16 -; VI-NEXT: s_lshr_b32 s75, s24, 16 -; VI-NEXT: s_lshr_b32 s76, s23, 16 -; VI-NEXT: s_lshr_b32 s77, s22, 16 -; VI-NEXT: s_lshr_b32 s78, s21, 16 -; VI-NEXT: s_lshr_b32 s79, s20, 16 -; VI-NEXT: s_lshr_b32 s88, s19, 16 -; VI-NEXT: s_lshr_b32 s89, s18, 16 -; VI-NEXT: s_lshr_b32 s90, s17, 16 -; VI-NEXT: s_lshr_b32 s91, s16, 16 +; VI-NEXT: s_lshr_b32 s60, s16, 16 +; VI-NEXT: s_lshr_b32 s61, s17, 16 +; VI-NEXT: s_lshr_b32 s62, s18, 16 +; VI-NEXT: s_lshr_b32 s63, s19, 16 +; VI-NEXT: s_lshr_b32 s72, s20, 16 +; VI-NEXT: s_lshr_b32 s73, s21, 16 +; VI-NEXT: s_lshr_b32 s74, s22, 16 +; VI-NEXT: s_lshr_b32 s75, s23, 16 +; VI-NEXT: s_lshr_b32 s76, s24, 16 +; VI-NEXT: s_lshr_b32 s77, s25, 16 +; VI-NEXT: s_lshr_b32 s78, s26, 16 +; VI-NEXT: s_lshr_b32 s79, s40, 16 +; VI-NEXT: s_lshr_b32 s88, s41, 16 +; VI-NEXT: s_lshr_b32 s89, s42, 16 +; VI-NEXT: s_lshr_b32 s90, s43, 16 +; VI-NEXT: s_lshr_b32 s91, s44, 16 ; VI-NEXT: .LBB17_3: ; %end -; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_and_b32 s4, 0xffff, s44 ; VI-NEXT: s_lshl_b32 s5, s91, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s16, s90, 16 -; VI-NEXT: s_or_b32 s5, s5, s16 -; VI-NEXT: s_and_b32 s16, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s17, s89, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_and_b32 s17, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s18, s88, 16 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s18, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s19, s79, 16 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_and_b32 s19, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s20, s78, 16 -; VI-NEXT: s_or_b32 s19, s19, s20 -; VI-NEXT: s_and_b32 s20, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s21, s77, 16 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_and_b32 s21, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s22, s76, 16 -; VI-NEXT: s_or_b32 s21, s21, s22 -; VI-NEXT: s_and_b32 s22, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s23, s75, 16 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_and_b32 s23, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s24, s74, 16 -; VI-NEXT: s_or_b32 s23, s23, s24 -; VI-NEXT: s_and_b32 s24, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s25, s73, 16 -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_and_b32 s25, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s26, s72, 16 -; VI-NEXT: s_or_b32 s25, s25, s26 -; VI-NEXT: s_and_b32 s26, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s27, s63, 16 -; VI-NEXT: s_or_b32 s26, s26, s27 -; VI-NEXT: s_and_b32 s27, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s28, s62, 16 -; VI-NEXT: s_or_b32 s27, s27, s28 -; VI-NEXT: s_and_b32 s28, 0xffff, s41 -; VI-NEXT: s_lshl_b32 s29, s61, 16 -; VI-NEXT: s_or_b32 s28, s28, s29 -; VI-NEXT: s_and_b32 s29, 0xffff, s40 -; VI-NEXT: s_lshl_b32 s40, s60, 16 -; VI-NEXT: s_or_b32 s29, s29, s40 +; VI-NEXT: s_and_b32 s5, 0xffff, s43 +; VI-NEXT: s_lshl_b32 s43, s90, 16 +; VI-NEXT: s_or_b32 s5, s5, s43 +; VI-NEXT: s_and_b32 s42, 0xffff, s42 +; VI-NEXT: s_lshl_b32 s43, s89, 16 +; VI-NEXT: s_or_b32 s42, s42, s43 +; VI-NEXT: s_and_b32 s41, 0xffff, s41 +; VI-NEXT: s_lshl_b32 s43, s88, 16 +; VI-NEXT: s_or_b32 s41, s41, s43 +; VI-NEXT: s_and_b32 s40, 0xffff, s40 +; VI-NEXT: s_lshl_b32 s43, s79, 16 +; VI-NEXT: s_or_b32 s40, s40, s43 +; VI-NEXT: s_and_b32 s26, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s43, s78, 16 +; VI-NEXT: s_or_b32 s26, s26, s43 +; VI-NEXT: s_and_b32 s25, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s43, s77, 16 +; VI-NEXT: s_or_b32 s25, s25, s43 +; VI-NEXT: s_and_b32 s24, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s43, s76, 16 +; VI-NEXT: s_or_b32 s24, s24, s43 +; VI-NEXT: s_and_b32 s23, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s43, s75, 16 +; VI-NEXT: s_or_b32 s23, s23, s43 +; VI-NEXT: s_and_b32 s22, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s43, s74, 16 +; VI-NEXT: s_or_b32 s22, s22, s43 +; VI-NEXT: s_and_b32 s21, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s43, s73, 16 +; VI-NEXT: s_or_b32 s21, s21, s43 +; VI-NEXT: s_and_b32 s20, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s43, s72, 16 +; VI-NEXT: s_or_b32 s20, s20, s43 +; VI-NEXT: s_and_b32 s19, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s43, s63, 16 +; VI-NEXT: s_or_b32 s19, s19, s43 +; VI-NEXT: s_and_b32 s18, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s43, s62, 16 +; VI-NEXT: s_or_b32 s18, s18, s43 +; VI-NEXT: s_and_b32 s17, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s43, s61, 16 +; VI-NEXT: s_or_b32 s17, s17, s43 +; VI-NEXT: s_and_b32 s16, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s43, s60, 16 +; VI-NEXT: s_or_b32 s16, s16, s43 ; VI-NEXT: s_and_b32 s15, 0xffff, s15 -; VI-NEXT: s_lshl_b32 s40, s59, 16 -; VI-NEXT: s_or_b32 s15, s15, s40 +; VI-NEXT: s_lshl_b32 s43, s59, 16 +; VI-NEXT: s_or_b32 s15, s15, s43 ; VI-NEXT: s_and_b32 s14, 0xffff, s14 -; VI-NEXT: s_lshl_b32 s40, s58, 16 -; VI-NEXT: s_or_b32 s14, s14, s40 +; VI-NEXT: s_lshl_b32 s43, s58, 16 +; VI-NEXT: s_or_b32 s14, s14, s43 ; VI-NEXT: s_and_b32 s13, 0xffff, s13 -; VI-NEXT: s_lshl_b32 s40, s57, 16 -; VI-NEXT: s_or_b32 s13, s13, s40 +; VI-NEXT: s_lshl_b32 s43, s57, 16 +; VI-NEXT: s_or_b32 s13, s13, s43 ; VI-NEXT: s_and_b32 s12, 0xffff, s12 -; VI-NEXT: s_lshl_b32 s40, s56, 16 -; VI-NEXT: s_or_b32 s12, s12, s40 +; VI-NEXT: s_lshl_b32 s43, s56, 16 +; VI-NEXT: s_or_b32 s12, s12, s43 ; VI-NEXT: s_and_b32 s11, 0xffff, s11 -; VI-NEXT: s_lshl_b32 s40, s47, 16 -; VI-NEXT: s_or_b32 s11, s11, s40 +; VI-NEXT: s_lshl_b32 s43, s47, 16 +; VI-NEXT: s_or_b32 s11, s11, s43 ; VI-NEXT: s_and_b32 s10, 0xffff, s10 -; VI-NEXT: s_lshl_b32 s40, s46, 16 -; VI-NEXT: s_or_b32 s10, s10, s40 +; VI-NEXT: s_lshl_b32 s43, s46, 16 +; VI-NEXT: s_or_b32 s10, s10, s43 ; VI-NEXT: s_and_b32 s9, 0xffff, s9 -; VI-NEXT: s_lshl_b32 s40, s45, 16 -; VI-NEXT: s_or_b32 s9, s9, s40 +; VI-NEXT: s_lshl_b32 s43, s45, 16 ; VI-NEXT: s_and_b32 s8, 0xffff, s8 -; VI-NEXT: s_lshl_b32 s40, s44, 16 -; VI-NEXT: s_or_b32 s8, s8, s40 +; VI-NEXT: s_lshl_b32 s29, s29, 16 ; VI-NEXT: s_and_b32 s6, 0xffff, s6 -; VI-NEXT: s_lshl_b32 s40, s43, 16 -; VI-NEXT: s_or_b32 s6, s6, s40 +; VI-NEXT: s_lshl_b32 s28, s28, 16 ; VI-NEXT: s_and_b32 s7, 0xffff, s7 -; VI-NEXT: s_lshl_b32 s40, s42, 16 -; VI-NEXT: s_or_b32 s7, s7, s40 +; VI-NEXT: s_lshl_b32 s27, s27, 16 +; VI-NEXT: s_or_b32 s9, s9, s43 +; VI-NEXT: s_or_b32 s8, s8, s29 +; VI-NEXT: s_or_b32 s6, s6, s28 +; VI-NEXT: s_or_b32 s7, s7, s27 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: v_mov_b32_e32 v3, s17 -; VI-NEXT: v_mov_b32_e32 v4, s18 -; VI-NEXT: v_mov_b32_e32 v5, s19 -; VI-NEXT: v_mov_b32_e32 v6, s20 -; VI-NEXT: v_mov_b32_e32 v7, s21 -; VI-NEXT: v_mov_b32_e32 v8, s22 -; VI-NEXT: v_mov_b32_e32 v9, s23 -; VI-NEXT: v_mov_b32_e32 v10, s24 -; VI-NEXT: v_mov_b32_e32 v11, s25 -; VI-NEXT: v_mov_b32_e32 v12, s26 -; VI-NEXT: v_mov_b32_e32 v13, s27 -; VI-NEXT: v_mov_b32_e32 v14, s28 -; VI-NEXT: v_mov_b32_e32 v15, s29 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_mov_b32_e32 v3, s41 +; VI-NEXT: v_mov_b32_e32 v4, s40 +; VI-NEXT: v_mov_b32_e32 v5, s26 +; VI-NEXT: v_mov_b32_e32 v6, s25 +; VI-NEXT: v_mov_b32_e32 v7, s24 +; VI-NEXT: v_mov_b32_e32 v8, s23 +; VI-NEXT: v_mov_b32_e32 v9, s22 +; VI-NEXT: v_mov_b32_e32 v10, s21 +; VI-NEXT: v_mov_b32_e32 v11, s20 +; VI-NEXT: v_mov_b32_e32 v12, s19 +; VI-NEXT: v_mov_b32_e32 v13, s18 +; VI-NEXT: v_mov_b32_e32 v14, s17 +; VI-NEXT: v_mov_b32_e32 v15, s16 ; VI-NEXT: v_mov_b32_e32 v16, s15 ; VI-NEXT: v_mov_b32_e32 v17, s14 ; VI-NEXT: v_mov_b32_e32 v18, s13 @@ -8511,25 +8678,53 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i ; VI-NEXT: ; implicit-def: $sgpr47 ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; implicit-def: $sgpr45 -; VI-NEXT: ; implicit-def: $sgpr44 -; VI-NEXT: ; implicit-def: $sgpr43 -; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 ; VI-NEXT: s_branch .LBB17_2 ; ; GFX9-LABEL: bitcast_v26i32_to_v52f16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, s16 +; GFX9-NEXT: v_mov_b32_e32 v14, s17 +; GFX9-NEXT: v_mov_b32_e32 v15, s18 +; GFX9-NEXT: v_mov_b32_e32 v16, s19 +; GFX9-NEXT: v_mov_b32_e32 v17, s20 +; GFX9-NEXT: v_mov_b32_e32 v18, s21 +; GFX9-NEXT: v_mov_b32_e32 v19, s22 +; GFX9-NEXT: v_readfirstlane_b32 s6, v13 +; GFX9-NEXT: v_mov_b32_e32 v13, s23 +; GFX9-NEXT: v_readfirstlane_b32 s7, v14 +; GFX9-NEXT: v_mov_b32_e32 v14, s24 +; GFX9-NEXT: v_readfirstlane_b32 s8, v15 +; GFX9-NEXT: v_mov_b32_e32 v15, s25 +; GFX9-NEXT: v_readfirstlane_b32 s9, v16 +; GFX9-NEXT: v_mov_b32_e32 v16, s26 +; GFX9-NEXT: v_readfirstlane_b32 s10, v17 +; GFX9-NEXT: v_mov_b32_e32 v17, s27 +; GFX9-NEXT: v_readfirstlane_b32 s11, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, s28 +; GFX9-NEXT: v_readfirstlane_b32 s12, v19 +; GFX9-NEXT: v_mov_b32_e32 v19, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: v_readfirstlane_b32 s7, v1 -; GFX9-NEXT: v_readfirstlane_b32 s8, v2 -; GFX9-NEXT: v_readfirstlane_b32 s9, v3 -; GFX9-NEXT: v_readfirstlane_b32 s10, v4 -; GFX9-NEXT: v_readfirstlane_b32 s11, v5 -; GFX9-NEXT: v_readfirstlane_b32 s12, v6 -; GFX9-NEXT: v_readfirstlane_b32 s13, v7 -; GFX9-NEXT: v_readfirstlane_b32 s14, v8 -; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s13, v13 +; GFX9-NEXT: v_readfirstlane_b32 s14, v14 +; GFX9-NEXT: v_readfirstlane_b32 s15, v15 +; GFX9-NEXT: v_readfirstlane_b32 s16, v16 +; GFX9-NEXT: v_readfirstlane_b32 s17, v17 +; GFX9-NEXT: v_readfirstlane_b32 s18, v18 +; GFX9-NEXT: v_readfirstlane_b32 s19, v19 +; GFX9-NEXT: v_readfirstlane_b32 s20, v0 +; GFX9-NEXT: v_readfirstlane_b32 s21, v1 +; GFX9-NEXT: v_readfirstlane_b32 s22, v2 +; GFX9-NEXT: v_readfirstlane_b32 s23, v3 +; GFX9-NEXT: v_readfirstlane_b32 s24, v4 +; GFX9-NEXT: v_readfirstlane_b32 s25, v5 +; GFX9-NEXT: v_readfirstlane_b32 s26, v6 +; GFX9-NEXT: v_readfirstlane_b32 s27, v7 +; GFX9-NEXT: v_readfirstlane_b32 s28, v8 +; GFX9-NEXT: v_readfirstlane_b32 s29, v9 ; GFX9-NEXT: v_readfirstlane_b32 s40, v10 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_readfirstlane_b32 s41, v11 @@ -8537,44 +8732,34 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_lshr_b32 s42, s41, 16 ; GFX9-NEXT: s_lshr_b32 s43, s40, 16 -; GFX9-NEXT: s_lshr_b32 s44, s15, 16 -; GFX9-NEXT: s_lshr_b32 s45, s14, 16 -; GFX9-NEXT: s_lshr_b32 s46, s13, 16 -; GFX9-NEXT: s_lshr_b32 s47, s12, 16 -; GFX9-NEXT: s_lshr_b32 s56, s11, 16 -; GFX9-NEXT: s_lshr_b32 s57, s10, 16 -; GFX9-NEXT: s_lshr_b32 s58, s9, 16 -; GFX9-NEXT: s_lshr_b32 s59, s8, 16 -; GFX9-NEXT: s_lshr_b32 s60, s7, 16 -; GFX9-NEXT: s_lshr_b32 s61, s6, 16 -; GFX9-NEXT: s_lshr_b32 s62, s29, 16 -; GFX9-NEXT: s_lshr_b32 s63, s28, 16 -; GFX9-NEXT: s_lshr_b32 s72, s27, 16 -; GFX9-NEXT: s_lshr_b32 s73, s26, 16 -; GFX9-NEXT: s_lshr_b32 s74, s25, 16 -; GFX9-NEXT: s_lshr_b32 s75, s24, 16 -; GFX9-NEXT: s_lshr_b32 s76, s23, 16 -; GFX9-NEXT: s_lshr_b32 s77, s22, 16 -; GFX9-NEXT: s_lshr_b32 s78, s21, 16 -; GFX9-NEXT: s_lshr_b32 s79, s20, 16 -; GFX9-NEXT: s_lshr_b32 s88, s19, 16 -; GFX9-NEXT: s_lshr_b32 s89, s18, 16 -; GFX9-NEXT: s_lshr_b32 s90, s17, 16 -; GFX9-NEXT: s_lshr_b32 s91, s16, 16 +; GFX9-NEXT: s_lshr_b32 s44, s29, 16 +; GFX9-NEXT: s_lshr_b32 s45, s28, 16 +; GFX9-NEXT: s_lshr_b32 s46, s27, 16 +; GFX9-NEXT: s_lshr_b32 s47, s26, 16 +; GFX9-NEXT: s_lshr_b32 s56, s25, 16 +; GFX9-NEXT: s_lshr_b32 s57, s24, 16 +; GFX9-NEXT: s_lshr_b32 s58, s23, 16 +; GFX9-NEXT: s_lshr_b32 s59, s22, 16 +; GFX9-NEXT: s_lshr_b32 s60, s21, 16 +; GFX9-NEXT: s_lshr_b32 s61, s20, 16 +; GFX9-NEXT: s_lshr_b32 s62, s19, 16 +; GFX9-NEXT: s_lshr_b32 s63, s18, 16 +; GFX9-NEXT: s_lshr_b32 s72, s17, 16 +; GFX9-NEXT: s_lshr_b32 s73, s16, 16 +; GFX9-NEXT: s_lshr_b32 s74, s15, 16 +; GFX9-NEXT: s_lshr_b32 s75, s14, 16 +; GFX9-NEXT: s_lshr_b32 s76, s13, 16 +; GFX9-NEXT: s_lshr_b32 s77, s12, 16 +; GFX9-NEXT: s_lshr_b32 s78, s11, 16 +; GFX9-NEXT: s_lshr_b32 s79, s10, 16 +; GFX9-NEXT: s_lshr_b32 s88, s9, 16 +; GFX9-NEXT: s_lshr_b32 s89, s8, 16 +; GFX9-NEXT: s_lshr_b32 s90, s7, 16 +; GFX9-NEXT: s_lshr_b32 s91, s6, 16 ; GFX9-NEXT: s_cbranch_execnz .LBB17_3 ; GFX9-NEXT: .LBB17_2: ; %cmp.true ; GFX9-NEXT: s_add_i32 s41, s41, 3 ; GFX9-NEXT: s_add_i32 s40, s40, 3 -; GFX9-NEXT: s_add_i32 s15, s15, 3 -; GFX9-NEXT: s_add_i32 s14, s14, 3 -; GFX9-NEXT: s_add_i32 s13, s13, 3 -; GFX9-NEXT: s_add_i32 s12, s12, 3 -; GFX9-NEXT: s_add_i32 s11, s11, 3 -; GFX9-NEXT: s_add_i32 s10, s10, 3 -; GFX9-NEXT: s_add_i32 s9, s9, 3 -; GFX9-NEXT: s_add_i32 s8, s8, 3 -; GFX9-NEXT: s_add_i32 s7, s7, 3 -; GFX9-NEXT: s_add_i32 s6, s6, 3 ; GFX9-NEXT: s_add_i32 s29, s29, 3 ; GFX9-NEXT: s_add_i32 s28, s28, 3 ; GFX9-NEXT: s_add_i32 s27, s27, 3 @@ -8589,83 +8774,93 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i ; GFX9-NEXT: s_add_i32 s18, s18, 3 ; GFX9-NEXT: s_add_i32 s17, s17, 3 ; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s15, s15, 3 +; GFX9-NEXT: s_add_i32 s14, s14, 3 +; GFX9-NEXT: s_add_i32 s13, s13, 3 +; GFX9-NEXT: s_add_i32 s12, s12, 3 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_add_i32 s10, s10, 3 +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: s_add_i32 s6, s6, 3 ; GFX9-NEXT: s_lshr_b32 s42, s41, 16 ; GFX9-NEXT: s_lshr_b32 s43, s40, 16 -; GFX9-NEXT: s_lshr_b32 s44, s15, 16 -; GFX9-NEXT: s_lshr_b32 s45, s14, 16 -; GFX9-NEXT: s_lshr_b32 s46, s13, 16 -; GFX9-NEXT: s_lshr_b32 s47, s12, 16 -; GFX9-NEXT: s_lshr_b32 s56, s11, 16 -; GFX9-NEXT: s_lshr_b32 s57, s10, 16 -; GFX9-NEXT: s_lshr_b32 s58, s9, 16 -; GFX9-NEXT: s_lshr_b32 s59, s8, 16 -; GFX9-NEXT: s_lshr_b32 s60, s7, 16 -; GFX9-NEXT: s_lshr_b32 s61, s6, 16 -; GFX9-NEXT: s_lshr_b32 s62, s29, 16 -; GFX9-NEXT: s_lshr_b32 s63, s28, 16 -; GFX9-NEXT: s_lshr_b32 s72, s27, 16 -; GFX9-NEXT: s_lshr_b32 s73, s26, 16 -; GFX9-NEXT: s_lshr_b32 s74, s25, 16 -; GFX9-NEXT: s_lshr_b32 s75, s24, 16 -; GFX9-NEXT: s_lshr_b32 s76, s23, 16 -; GFX9-NEXT: s_lshr_b32 s77, s22, 16 -; GFX9-NEXT: s_lshr_b32 s78, s21, 16 -; GFX9-NEXT: s_lshr_b32 s79, s20, 16 -; GFX9-NEXT: s_lshr_b32 s88, s19, 16 -; GFX9-NEXT: s_lshr_b32 s89, s18, 16 -; GFX9-NEXT: s_lshr_b32 s90, s17, 16 -; GFX9-NEXT: s_lshr_b32 s91, s16, 16 +; GFX9-NEXT: s_lshr_b32 s44, s29, 16 +; GFX9-NEXT: s_lshr_b32 s45, s28, 16 +; GFX9-NEXT: s_lshr_b32 s46, s27, 16 +; GFX9-NEXT: s_lshr_b32 s47, s26, 16 +; GFX9-NEXT: s_lshr_b32 s56, s25, 16 +; GFX9-NEXT: s_lshr_b32 s57, s24, 16 +; GFX9-NEXT: s_lshr_b32 s58, s23, 16 +; GFX9-NEXT: s_lshr_b32 s59, s22, 16 +; GFX9-NEXT: s_lshr_b32 s60, s21, 16 +; GFX9-NEXT: s_lshr_b32 s61, s20, 16 +; GFX9-NEXT: s_lshr_b32 s62, s19, 16 +; GFX9-NEXT: s_lshr_b32 s63, s18, 16 +; GFX9-NEXT: s_lshr_b32 s72, s17, 16 +; GFX9-NEXT: s_lshr_b32 s73, s16, 16 +; GFX9-NEXT: s_lshr_b32 s74, s15, 16 +; GFX9-NEXT: s_lshr_b32 s75, s14, 16 +; GFX9-NEXT: s_lshr_b32 s76, s13, 16 +; GFX9-NEXT: s_lshr_b32 s77, s12, 16 +; GFX9-NEXT: s_lshr_b32 s78, s11, 16 +; GFX9-NEXT: s_lshr_b32 s79, s10, 16 +; GFX9-NEXT: s_lshr_b32 s88, s9, 16 +; GFX9-NEXT: s_lshr_b32 s89, s8, 16 +; GFX9-NEXT: s_lshr_b32 s90, s7, 16 +; GFX9-NEXT: s_lshr_b32 s91, s6, 16 ; GFX9-NEXT: .LBB17_3: ; %end -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s91 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s90 -; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s89 -; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s88 -; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s79 -; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s78 -; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s77 -; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s76 -; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s75 -; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s74 -; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s73 -; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s72 -; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s63 -; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s62 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s61 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s60 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s59 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s58 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s57 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s56 -; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s47 -; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s46 -; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s45 -; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s91 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s7, s90 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s89 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s9, s88 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s10, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s11, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s12, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s13, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s14, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s15, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s16, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s17, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s44 ; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s43 ; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s42 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s18 -; GFX9-NEXT: v_mov_b32_e32 v5, s19 -; GFX9-NEXT: v_mov_b32_e32 v6, s20 -; GFX9-NEXT: v_mov_b32_e32 v7, s21 -; GFX9-NEXT: v_mov_b32_e32 v8, s22 -; GFX9-NEXT: v_mov_b32_e32 v9, s23 -; GFX9-NEXT: v_mov_b32_e32 v10, s24 -; GFX9-NEXT: v_mov_b32_e32 v11, s25 -; GFX9-NEXT: v_mov_b32_e32 v12, s26 -; GFX9-NEXT: v_mov_b32_e32 v13, s27 -; GFX9-NEXT: v_mov_b32_e32 v14, s6 -; GFX9-NEXT: v_mov_b32_e32 v15, s7 -; GFX9-NEXT: v_mov_b32_e32 v16, s8 -; GFX9-NEXT: v_mov_b32_e32 v17, s9 -; GFX9-NEXT: v_mov_b32_e32 v18, s10 -; GFX9-NEXT: v_mov_b32_e32 v19, s11 -; GFX9-NEXT: v_mov_b32_e32 v20, s12 -; GFX9-NEXT: v_mov_b32_e32 v21, s13 -; GFX9-NEXT: v_mov_b32_e32 v22, s14 -; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-NEXT: v_mov_b32_e32 v12, s16 +; GFX9-NEXT: v_mov_b32_e32 v13, s17 +; GFX9-NEXT: v_mov_b32_e32 v14, s18 +; GFX9-NEXT: v_mov_b32_e32 v15, s19 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v18, s22 +; GFX9-NEXT: v_mov_b32_e32 v19, s23 +; GFX9-NEXT: v_mov_b32_e32 v20, s24 +; GFX9-NEXT: v_mov_b32_e32 v21, s25 +; GFX9-NEXT: v_mov_b32_e32 v22, s26 +; GFX9-NEXT: v_mov_b32_e32 v23, s27 ; GFX9-NEXT: v_mov_b32_e32 v24, s28 ; GFX9-NEXT: v_mov_b32_e32 v25, s29 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -8701,41 +8896,68 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i ; GFX11-LABEL: bitcast_v26i32_to_v52f16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 +; GFX11-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 +; GFX11-NEXT: v_dual_mov_b32 v13, s16 :: v_dual_mov_b32 v14, s17 +; GFX11-NEXT: v_dual_mov_b32 v15, s18 :: v_dual_mov_b32 v16, s19 +; GFX11-NEXT: v_dual_mov_b32 v17, s20 :: v_dual_mov_b32 v18, s21 +; GFX11-NEXT: v_dual_mov_b32 v19, s22 :: v_dual_mov_b32 v20, s23 +; GFX11-NEXT: v_dual_mov_b32 v21, s24 :: v_dual_mov_b32 v22, s25 +; GFX11-NEXT: v_dual_mov_b32 v23, s26 :: v_dual_mov_b32 v24, s27 +; GFX11-NEXT: v_dual_mov_b32 v25, s28 :: v_dual_mov_b32 v26, s29 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: v_readfirstlane_b32 s8, v4 -; GFX11-NEXT: v_readfirstlane_b32 s9, v5 -; GFX11-NEXT: v_readfirstlane_b32 s11, v6 -; GFX11-NEXT: v_readfirstlane_b32 s10, v7 +; GFX11-NEXT: v_readfirstlane_b32 s0, v9 +; GFX11-NEXT: v_readfirstlane_b32 s1, v10 +; GFX11-NEXT: v_readfirstlane_b32 s2, v11 +; GFX11-NEXT: v_readfirstlane_b32 s3, v12 +; GFX11-NEXT: v_readfirstlane_b32 s4, v13 +; GFX11-NEXT: v_readfirstlane_b32 s5, v14 +; GFX11-NEXT: v_readfirstlane_b32 s6, v15 +; GFX11-NEXT: v_readfirstlane_b32 s7, v16 +; GFX11-NEXT: v_readfirstlane_b32 s8, v17 +; GFX11-NEXT: v_readfirstlane_b32 s9, v18 +; GFX11-NEXT: v_readfirstlane_b32 s10, v19 +; GFX11-NEXT: v_readfirstlane_b32 s11, v20 +; GFX11-NEXT: v_readfirstlane_b32 s12, v21 +; GFX11-NEXT: v_readfirstlane_b32 s13, v22 +; GFX11-NEXT: v_readfirstlane_b32 s14, v23 +; GFX11-NEXT: v_readfirstlane_b32 s15, v24 +; GFX11-NEXT: v_readfirstlane_b32 s16, v25 +; GFX11-NEXT: v_readfirstlane_b32 s17, v26 +; GFX11-NEXT: v_readfirstlane_b32 s18, v0 +; GFX11-NEXT: v_readfirstlane_b32 s19, v1 +; GFX11-NEXT: v_readfirstlane_b32 s20, v2 +; GFX11-NEXT: v_readfirstlane_b32 s21, v3 +; GFX11-NEXT: v_readfirstlane_b32 s22, v4 +; GFX11-NEXT: v_readfirstlane_b32 s23, v5 +; GFX11-NEXT: v_readfirstlane_b32 s25, v6 +; GFX11-NEXT: v_readfirstlane_b32 s24, v7 ; GFX11-NEXT: s_mov_b32 s78, 0 -; GFX11-NEXT: s_and_b32 s12, vcc_lo, exec_lo +; GFX11-NEXT: s_and_b32 s26, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB17_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: s_lshr_b32 s12, s10, 16 -; GFX11-NEXT: s_lshr_b32 s13, s11, 16 -; GFX11-NEXT: s_lshr_b32 s14, s9, 16 -; GFX11-NEXT: s_lshr_b32 s15, s8, 16 -; GFX11-NEXT: s_lshr_b32 s40, s7, 16 -; GFX11-NEXT: s_lshr_b32 s41, s6, 16 -; GFX11-NEXT: s_lshr_b32 s42, s5, 16 -; GFX11-NEXT: s_lshr_b32 s43, s4, 16 -; GFX11-NEXT: s_lshr_b32 s44, s29, 16 -; GFX11-NEXT: s_lshr_b32 s45, s28, 16 -; GFX11-NEXT: s_lshr_b32 s46, s27, 16 -; GFX11-NEXT: s_lshr_b32 s47, s26, 16 -; GFX11-NEXT: s_lshr_b32 s56, s25, 16 -; GFX11-NEXT: s_lshr_b32 s57, s24, 16 -; GFX11-NEXT: s_lshr_b32 s58, s23, 16 -; GFX11-NEXT: s_lshr_b32 s59, s22, 16 -; GFX11-NEXT: s_lshr_b32 s60, s21, 16 -; GFX11-NEXT: s_lshr_b32 s61, s20, 16 -; GFX11-NEXT: s_lshr_b32 s62, s19, 16 -; GFX11-NEXT: s_lshr_b32 s63, s18, 16 -; GFX11-NEXT: s_lshr_b32 s72, s17, 16 -; GFX11-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-NEXT: s_lshr_b32 s26, s24, 16 +; GFX11-NEXT: s_lshr_b32 s27, s25, 16 +; GFX11-NEXT: s_lshr_b32 s28, s23, 16 +; GFX11-NEXT: s_lshr_b32 s29, s22, 16 +; GFX11-NEXT: s_lshr_b32 s40, s21, 16 +; GFX11-NEXT: s_lshr_b32 s41, s20, 16 +; GFX11-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-NEXT: s_lshr_b32 s44, s17, 16 +; GFX11-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-NEXT: s_lshr_b32 s46, s15, 16 +; GFX11-NEXT: s_lshr_b32 s47, s14, 16 +; GFX11-NEXT: s_lshr_b32 s56, s13, 16 +; GFX11-NEXT: s_lshr_b32 s57, s12, 16 +; GFX11-NEXT: s_lshr_b32 s58, s11, 16 +; GFX11-NEXT: s_lshr_b32 s59, s10, 16 +; GFX11-NEXT: s_lshr_b32 s60, s9, 16 +; GFX11-NEXT: s_lshr_b32 s61, s8, 16 +; GFX11-NEXT: s_lshr_b32 s62, s7, 16 +; GFX11-NEXT: s_lshr_b32 s63, s6, 16 +; GFX11-NEXT: s_lshr_b32 s72, s5, 16 +; GFX11-NEXT: s_lshr_b32 s73, s4, 16 ; GFX11-NEXT: s_lshr_b32 s74, s3, 16 ; GFX11-NEXT: s_lshr_b32 s75, s2, 16 ; GFX11-NEXT: s_lshr_b32 s76, s1, 16 @@ -8743,20 +8965,8 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s78 ; GFX11-NEXT: s_cbranch_vccnz .LBB17_3 ; GFX11-NEXT: .LBB17_2: ; %cmp.true -; GFX11-NEXT: s_add_i32 s10, s10, 3 -; GFX11-NEXT: s_add_i32 s11, s11, 3 -; GFX11-NEXT: s_add_i32 s9, s9, 3 -; GFX11-NEXT: s_add_i32 s8, s8, 3 -; GFX11-NEXT: s_add_i32 s7, s7, 3 -; GFX11-NEXT: s_add_i32 s6, s6, 3 -; GFX11-NEXT: s_add_i32 s5, s5, 3 -; GFX11-NEXT: s_add_i32 s4, s4, 3 -; GFX11-NEXT: s_add_i32 s29, s29, 3 -; GFX11-NEXT: s_add_i32 s28, s28, 3 -; GFX11-NEXT: s_add_i32 s27, s27, 3 -; GFX11-NEXT: s_add_i32 s26, s26, 3 -; GFX11-NEXT: s_add_i32 s25, s25, 3 ; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s25, s25, 3 ; GFX11-NEXT: s_add_i32 s23, s23, 3 ; GFX11-NEXT: s_add_i32 s22, s22, 3 ; GFX11-NEXT: s_add_i32 s21, s21, 3 @@ -8765,32 +8975,44 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i ; GFX11-NEXT: s_add_i32 s18, s18, 3 ; GFX11-NEXT: s_add_i32 s17, s17, 3 ; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s15, s15, 3 +; GFX11-NEXT: s_add_i32 s14, s14, 3 +; GFX11-NEXT: s_add_i32 s13, s13, 3 +; GFX11-NEXT: s_add_i32 s12, s12, 3 +; GFX11-NEXT: s_add_i32 s11, s11, 3 +; GFX11-NEXT: s_add_i32 s10, s10, 3 +; GFX11-NEXT: s_add_i32 s9, s9, 3 +; GFX11-NEXT: s_add_i32 s8, s8, 3 +; GFX11-NEXT: s_add_i32 s7, s7, 3 +; GFX11-NEXT: s_add_i32 s6, s6, 3 +; GFX11-NEXT: s_add_i32 s5, s5, 3 +; GFX11-NEXT: s_add_i32 s4, s4, 3 ; GFX11-NEXT: s_add_i32 s3, s3, 3 ; GFX11-NEXT: s_add_i32 s2, s2, 3 ; GFX11-NEXT: s_add_i32 s1, s1, 3 ; GFX11-NEXT: s_add_i32 s0, s0, 3 -; GFX11-NEXT: s_lshr_b32 s12, s10, 16 -; GFX11-NEXT: s_lshr_b32 s13, s11, 16 -; GFX11-NEXT: s_lshr_b32 s14, s9, 16 -; GFX11-NEXT: s_lshr_b32 s15, s8, 16 -; GFX11-NEXT: s_lshr_b32 s40, s7, 16 -; GFX11-NEXT: s_lshr_b32 s41, s6, 16 -; GFX11-NEXT: s_lshr_b32 s42, s5, 16 -; GFX11-NEXT: s_lshr_b32 s43, s4, 16 -; GFX11-NEXT: s_lshr_b32 s44, s29, 16 -; GFX11-NEXT: s_lshr_b32 s45, s28, 16 -; GFX11-NEXT: s_lshr_b32 s46, s27, 16 -; GFX11-NEXT: s_lshr_b32 s47, s26, 16 -; GFX11-NEXT: s_lshr_b32 s56, s25, 16 -; GFX11-NEXT: s_lshr_b32 s57, s24, 16 -; GFX11-NEXT: s_lshr_b32 s58, s23, 16 -; GFX11-NEXT: s_lshr_b32 s59, s22, 16 -; GFX11-NEXT: s_lshr_b32 s60, s21, 16 -; GFX11-NEXT: s_lshr_b32 s61, s20, 16 -; GFX11-NEXT: s_lshr_b32 s62, s19, 16 -; GFX11-NEXT: s_lshr_b32 s63, s18, 16 -; GFX11-NEXT: s_lshr_b32 s72, s17, 16 -; GFX11-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-NEXT: s_lshr_b32 s26, s24, 16 +; GFX11-NEXT: s_lshr_b32 s27, s25, 16 +; GFX11-NEXT: s_lshr_b32 s28, s23, 16 +; GFX11-NEXT: s_lshr_b32 s29, s22, 16 +; GFX11-NEXT: s_lshr_b32 s40, s21, 16 +; GFX11-NEXT: s_lshr_b32 s41, s20, 16 +; GFX11-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-NEXT: s_lshr_b32 s44, s17, 16 +; GFX11-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-NEXT: s_lshr_b32 s46, s15, 16 +; GFX11-NEXT: s_lshr_b32 s47, s14, 16 +; GFX11-NEXT: s_lshr_b32 s56, s13, 16 +; GFX11-NEXT: s_lshr_b32 s57, s12, 16 +; GFX11-NEXT: s_lshr_b32 s58, s11, 16 +; GFX11-NEXT: s_lshr_b32 s59, s10, 16 +; GFX11-NEXT: s_lshr_b32 s60, s9, 16 +; GFX11-NEXT: s_lshr_b32 s61, s8, 16 +; GFX11-NEXT: s_lshr_b32 s62, s7, 16 +; GFX11-NEXT: s_lshr_b32 s63, s6, 16 +; GFX11-NEXT: s_lshr_b32 s72, s5, 16 +; GFX11-NEXT: s_lshr_b32 s73, s4, 16 ; GFX11-NEXT: s_lshr_b32 s74, s3, 16 ; GFX11-NEXT: s_lshr_b32 s75, s2, 16 ; GFX11-NEXT: s_lshr_b32 s76, s1, 16 @@ -8801,41 +9023,41 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i ; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s76 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s75 ; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s74 -; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s73 -; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s72 -; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s63 -; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s62 -; GFX11-NEXT: s_pack_ll_b32_b16 s20, s20, s61 -; GFX11-NEXT: s_pack_ll_b32_b16 s21, s21, s60 -; GFX11-NEXT: s_pack_ll_b32_b16 s22, s22, s59 -; GFX11-NEXT: s_pack_ll_b32_b16 s23, s23, s58 -; GFX11-NEXT: s_pack_ll_b32_b16 s24, s24, s57 -; GFX11-NEXT: s_pack_ll_b32_b16 s25, s25, s56 -; GFX11-NEXT: s_pack_ll_b32_b16 s26, s26, s47 -; GFX11-NEXT: s_pack_ll_b32_b16 s27, s27, s46 -; GFX11-NEXT: s_pack_ll_b32_b16 s28, s28, s45 -; GFX11-NEXT: s_pack_ll_b32_b16 s29, s29, s44 -; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s43 -; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s42 -; GFX11-NEXT: s_pack_ll_b32_b16 s6, s6, s41 -; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s40 -; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s15 -; GFX11-NEXT: s_pack_ll_b32_b16 s9, s9, s14 -; GFX11-NEXT: s_pack_ll_b32_b16 s11, s11, s13 -; GFX11-NEXT: s_pack_ll_b32_b16 s10, s10, s12 +; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s73 +; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s72 +; GFX11-NEXT: s_pack_ll_b32_b16 s6, s6, s63 +; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s62 +; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s61 +; GFX11-NEXT: s_pack_ll_b32_b16 s9, s9, s60 +; GFX11-NEXT: s_pack_ll_b32_b16 s10, s10, s59 +; GFX11-NEXT: s_pack_ll_b32_b16 s11, s11, s58 +; GFX11-NEXT: s_pack_ll_b32_b16 s12, s12, s57 +; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s56 +; GFX11-NEXT: s_pack_ll_b32_b16 s14, s14, s47 +; GFX11-NEXT: s_pack_ll_b32_b16 s15, s15, s46 +; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s45 +; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s44 +; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s43 +; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s42 +; GFX11-NEXT: s_pack_ll_b32_b16 s20, s20, s41 +; GFX11-NEXT: s_pack_ll_b32_b16 s21, s21, s40 +; GFX11-NEXT: s_pack_ll_b32_b16 s22, s22, s29 +; GFX11-NEXT: s_pack_ll_b32_b16 s23, s23, s28 +; GFX11-NEXT: s_pack_ll_b32_b16 s25, s25, s27 +; GFX11-NEXT: s_pack_ll_b32_b16 s24, s24, s26 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 -; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 -; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 -; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 -; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 -; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 -; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 -; GFX11-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 -; GFX11-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 -; GFX11-NEXT: v_dual_mov_b32 v22, s8 :: v_dual_mov_b32 v23, s9 -; GFX11-NEXT: v_dual_mov_b32 v24, s11 :: v_dual_mov_b32 v25, s10 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GFX11-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v21, s21 +; GFX11-NEXT: v_dual_mov_b32 v22, s22 :: v_dual_mov_b32 v23, s23 +; GFX11-NEXT: v_dual_mov_b32 v24, s25 :: v_dual_mov_b32 v25, s24 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB17_4: ; GFX11-NEXT: ; implicit-def: $sgpr77 @@ -8860,10 +9082,10 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr41 ; GFX11-NEXT: ; implicit-def: $sgpr40 -; GFX11-NEXT: ; implicit-def: $sgpr15 -; GFX11-NEXT: ; implicit-def: $sgpr14 -; GFX11-NEXT: ; implicit-def: $sgpr13 -; GFX11-NEXT: ; implicit-def: $sgpr12 +; GFX11-NEXT: ; implicit-def: $sgpr29 +; GFX11-NEXT: ; implicit-def: $sgpr28 +; GFX11-NEXT: ; implicit-def: $sgpr27 +; GFX11-NEXT: ; implicit-def: $sgpr26 ; GFX11-NEXT: s_branch .LBB17_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -15485,7 +15707,7 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 ; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 ; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v24 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v25 ; SI-NEXT: .LBB30_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] @@ -18526,19 +18748,6 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; SI-NEXT: v_readfirstlane_b32 s41, v1 -; SI-NEXT: v_readfirstlane_b32 s40, v2 -; SI-NEXT: v_readfirstlane_b32 s15, v3 -; SI-NEXT: v_readfirstlane_b32 s14, v4 -; SI-NEXT: v_readfirstlane_b32 s13, v5 -; SI-NEXT: v_readfirstlane_b32 s12, v6 -; SI-NEXT: v_readfirstlane_b32 s11, v7 -; SI-NEXT: v_readfirstlane_b32 s10, v8 -; SI-NEXT: v_readfirstlane_b32 s8, v9 -; SI-NEXT: v_readfirstlane_b32 s7, v10 -; SI-NEXT: v_readfirstlane_b32 s6, v11 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -18555,377 +18764,468 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v21, s16 +; SI-NEXT: v_mov_b32_e32 v18, s17 +; SI-NEXT: v_mov_b32_e32 v16, s18 +; SI-NEXT: v_mov_b32_e32 v15, s19 +; SI-NEXT: v_mov_b32_e32 v14, s20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, s21 +; SI-NEXT: v_mov_b32_e32 v25, s22 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v26, s23 +; SI-NEXT: v_mov_b32_e32 v24, s24 +; SI-NEXT: v_mov_b32_e32 v23, s25 +; SI-NEXT: v_mov_b32_e32 v22, s26 +; SI-NEXT: v_mov_b32_e32 v19, s27 +; SI-NEXT: v_mov_b32_e32 v17, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB33_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 -; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 -; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s40, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s41, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s16 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v11 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v23 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v25 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v15 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v21 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v8 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v7 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v6 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v5 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v4 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v3 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v2 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v1 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v13 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v17 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v18 ; SI-NEXT: s_cbranch_execnz .LBB33_3 ; SI-NEXT: .LBB33_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v3, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v5, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v7, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v9, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v11, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v16, s24, 1.0 -; SI-NEXT: v_add_f32_e64 v18, s25, 1.0 -; SI-NEXT: v_add_f32_e64 v20, s26, 1.0 -; SI-NEXT: v_add_f32_e64 v22, s27, 1.0 -; SI-NEXT: v_add_f32_e64 v24, s28, 1.0 -; SI-NEXT: v_add_f32_e64 v27, s29, 1.0 -; SI-NEXT: v_add_f32_e64 v25, s41, 1.0 -; SI-NEXT: v_add_f32_e64 v23, s40, 1.0 -; SI-NEXT: v_add_f32_e64 v21, s15, 1.0 -; SI-NEXT: v_add_f32_e64 v19, s14, 1.0 -; SI-NEXT: v_add_f32_e64 v17, s13, 1.0 -; SI-NEXT: v_add_f32_e64 v15, s12, 1.0 -; SI-NEXT: v_add_f32_e64 v13, s11, 1.0 -; SI-NEXT: v_add_f32_e64 v12, s10, 1.0 -; SI-NEXT: v_add_f32_e64 v10, s8, 1.0 -; SI-NEXT: v_add_f32_e64 v8, s7, 1.0 -; SI-NEXT: v_add_f32_e64 v6, s6, 1.0 -; SI-NEXT: v_add_f32_e64 v29, s9, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v21 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v19 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v17 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v15 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v13 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v12 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v10 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_mov_b32_e32 v35, v11 +; SI-NEXT: v_mov_b32_e32 v29, v12 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: .LBB33_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v4, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v30, v4 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v42 -; SI-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v30, v41, v30 -; SI-NEXT: buffer_store_dword v30, v4, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v4, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v20 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v30, v55 -; SI-NEXT: v_add_i32_e32 v55, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v30, v4 -; SI-NEXT: buffer_store_dword v4, v55, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v53 -; SI-NEXT: v_add_i32_e32 v53, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v30, v4 -; SI-NEXT: buffer_store_dword v4, v53, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v52 -; SI-NEXT: v_add_i32_e32 v51, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v30, v4 -; SI-NEXT: buffer_store_dword v4, v51, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v9 -; SI-NEXT: v_add_i32_e32 v49, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v30, v4 -; SI-NEXT: buffer_store_dword v4, v49, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v11 -; SI-NEXT: v_add_i32_e32 v39, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v30, v4 -; SI-NEXT: buffer_store_dword v4, v39, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v13 -; SI-NEXT: v_add_i32_e32 v37, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v30, v4 -; SI-NEXT: buffer_store_dword v4, v37, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v12 -; SI-NEXT: v_add_i32_e32 v35, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v30, v4 -; SI-NEXT: buffer_store_dword v4, v35, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v10 -; SI-NEXT: v_add_i32_e32 v33, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v30, v4 -; SI-NEXT: buffer_store_dword v4, v33, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v8 -; SI-NEXT: v_add_i32_e32 v31, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v30, v4 -; SI-NEXT: buffer_store_dword v4, v31, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_i32_e32 v28, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: buffer_store_dword v4, v28, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v29 -; SI-NEXT: v_add_i32_e32 v26, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: buffer_store_dword v4, v26, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v27 -; SI-NEXT: v_add_i32_e32 v24, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: buffer_store_dword v4, v24, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v25 -; SI-NEXT: v_add_i32_e32 v22, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: buffer_store_dword v4, v22, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v23 -; SI-NEXT: v_add_i32_e32 v20, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: buffer_store_dword v4, v20, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v21 -; SI-NEXT: v_add_i32_e32 v18, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: buffer_store_dword v4, v18, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v19 -; SI-NEXT: v_add_i32_e32 v16, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: buffer_store_dword v4, v16, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v17 -; SI-NEXT: v_add_i32_e32 v14, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: buffer_store_dword v4, v14, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v15 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: buffer_store_dword v4, v11, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v48 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v38 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v36 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v34 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v32 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -18949,58 +19249,73 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: s_branch .LBB33_2 ; ; VI-LABEL: bitcast_v26f32_to_v52f16_scalar: @@ -24069,18 +24384,46 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v14, s30, 0 -; SI-NEXT: v_writelane_b32 v14, s31, 1 -; SI-NEXT: v_writelane_b32 v14, s34, 2 -; SI-NEXT: v_writelane_b32 v14, s35, 3 -; SI-NEXT: v_writelane_b32 v14, s36, 4 -; SI-NEXT: v_writelane_b32 v14, s37, 5 -; SI-NEXT: v_writelane_b32 v14, s38, 6 +; SI-NEXT: v_writelane_b32 v20, s30, 0 +; SI-NEXT: v_writelane_b32 v20, s31, 1 +; SI-NEXT: v_writelane_b32 v20, s34, 2 +; SI-NEXT: v_writelane_b32 v20, s35, 3 +; SI-NEXT: v_writelane_b32 v20, s36, 4 +; SI-NEXT: v_mov_b32_e32 v14, s16 +; SI-NEXT: v_mov_b32_e32 v15, s17 +; SI-NEXT: v_writelane_b32 v20, s37, 5 +; SI-NEXT: v_mov_b32_e32 v16, s18 +; SI-NEXT: v_mov_b32_e32 v17, s19 +; SI-NEXT: v_mov_b32_e32 v18, s20 +; SI-NEXT: v_mov_b32_e32 v19, s21 +; SI-NEXT: v_readfirstlane_b32 s42, v14 +; SI-NEXT: v_mov_b32_e32 v14, s22 +; SI-NEXT: v_readfirstlane_b32 s43, v15 +; SI-NEXT: v_mov_b32_e32 v15, s23 +; SI-NEXT: v_writelane_b32 v20, s38, 6 +; SI-NEXT: v_readfirstlane_b32 s40, v16 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_readfirstlane_b32 s41, v17 +; SI-NEXT: v_mov_b32_e32 v17, s25 +; SI-NEXT: v_readfirstlane_b32 s24, v18 +; SI-NEXT: v_mov_b32_e32 v18, s26 +; SI-NEXT: v_readfirstlane_b32 s25, v19 +; SI-NEXT: v_mov_b32_e32 v19, s27 +; SI-NEXT: v_readfirstlane_b32 s22, v14 +; SI-NEXT: v_mov_b32_e32 v14, s28 +; SI-NEXT: v_readfirstlane_b32 s23, v15 +; SI-NEXT: v_mov_b32_e32 v15, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; SI-NEXT: v_writelane_b32 v14, s39, 7 +; SI-NEXT: v_writelane_b32 v20, s39, 7 +; SI-NEXT: v_readfirstlane_b32 s20, v16 +; SI-NEXT: v_readfirstlane_b32 s21, v17 +; SI-NEXT: v_readfirstlane_b32 s18, v18 +; SI-NEXT: v_readfirstlane_b32 s19, v19 +; SI-NEXT: v_readfirstlane_b32 s16, v14 +; SI-NEXT: v_readfirstlane_b32 s17, v15 ; SI-NEXT: v_readfirstlane_b32 s14, v1 ; SI-NEXT: v_readfirstlane_b32 s15, v2 ; SI-NEXT: v_readfirstlane_b32 s12, v3 @@ -24092,9 +24435,9 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3 ; SI-NEXT: v_readfirstlane_b32 s6, v9 ; SI-NEXT: v_readfirstlane_b32 s7, v10 ; SI-NEXT: v_readfirstlane_b32 s4, v11 -; SI-NEXT: s_and_b64 s[40:41], vcc, exec +; SI-NEXT: s_and_b64 s[26:27], vcc, exec ; SI-NEXT: v_readfirstlane_b32 s5, v12 -; SI-NEXT: v_writelane_b32 v14, s48, 8 +; SI-NEXT: v_writelane_b32 v20, s48, 8 ; SI-NEXT: s_cbranch_scc0 .LBB41_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s92, s5, 16 @@ -24103,26 +24446,26 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3 ; SI-NEXT: s_lshr_b32 s95, s11, 16 ; SI-NEXT: s_lshr_b32 s30, s13, 16 ; SI-NEXT: s_lshr_b32 s31, s15, 16 -; SI-NEXT: s_lshr_b32 s34, s29, 16 -; SI-NEXT: s_lshr_b32 s35, s27, 16 -; SI-NEXT: s_lshr_b32 s36, s25, 16 +; SI-NEXT: s_lshr_b32 s34, s17, 16 +; SI-NEXT: s_lshr_b32 s35, s19, 16 +; SI-NEXT: s_lshr_b32 s36, s21, 16 ; SI-NEXT: s_lshr_b32 s37, s23, 16 -; SI-NEXT: s_lshr_b32 s38, s21, 16 -; SI-NEXT: s_lshr_b32 s39, s19, 16 -; SI-NEXT: s_lshr_b32 s48, s17, 16 -; SI-NEXT: s_lshr_b64 s[40:41], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[42:43], s[6:7], 16 +; SI-NEXT: s_lshr_b32 s38, s25, 16 +; SI-NEXT: s_lshr_b32 s39, s41, 16 +; SI-NEXT: s_lshr_b32 s48, s43, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 ; SI-NEXT: s_lshr_b64 s[44:45], s[8:9], 16 ; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 16 ; SI-NEXT: s_lshr_b64 s[56:57], s[12:13], 16 ; SI-NEXT: s_lshr_b64 s[58:59], s[14:15], 16 -; SI-NEXT: s_lshr_b64 s[60:61], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[62:63], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[72:73], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[20:21], 16 ; SI-NEXT: s_lshr_b64 s[74:75], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[78:79], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[88:89], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[42:43], 16 ; SI-NEXT: s_cbranch_execnz .LBB41_3 ; SI-NEXT: .LBB41_2: ; %cmp.true ; SI-NEXT: s_add_u32 s4, s4, 3 @@ -24137,132 +24480,132 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3 ; SI-NEXT: s_addc_u32 s13, s13, 0 ; SI-NEXT: s_add_u32 s14, s14, 3 ; SI-NEXT: s_addc_u32 s15, s15, 0 -; SI-NEXT: s_add_u32 s28, s28, 3 -; SI-NEXT: s_addc_u32 s29, s29, 0 -; SI-NEXT: s_add_u32 s26, s26, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 -; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s25, s25, 0 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 ; SI-NEXT: s_add_u32 s16, s16, 3 ; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s40, s40, 3 +; SI-NEXT: s_addc_u32 s41, s41, 0 +; SI-NEXT: s_add_u32 s42, s42, 3 +; SI-NEXT: s_addc_u32 s43, s43, 0 ; SI-NEXT: s_lshr_b32 s92, s5, 16 ; SI-NEXT: s_lshr_b32 s93, s7, 16 ; SI-NEXT: s_lshr_b32 s94, s9, 16 ; SI-NEXT: s_lshr_b32 s95, s11, 16 ; SI-NEXT: s_lshr_b32 s30, s13, 16 ; SI-NEXT: s_lshr_b32 s31, s15, 16 -; SI-NEXT: s_lshr_b32 s34, s29, 16 -; SI-NEXT: s_lshr_b32 s35, s27, 16 -; SI-NEXT: s_lshr_b32 s36, s25, 16 +; SI-NEXT: s_lshr_b32 s34, s17, 16 +; SI-NEXT: s_lshr_b32 s35, s19, 16 +; SI-NEXT: s_lshr_b32 s36, s21, 16 ; SI-NEXT: s_lshr_b32 s37, s23, 16 -; SI-NEXT: s_lshr_b32 s38, s21, 16 -; SI-NEXT: s_lshr_b32 s39, s19, 16 -; SI-NEXT: s_lshr_b32 s48, s17, 16 -; SI-NEXT: s_lshr_b64 s[40:41], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[42:43], s[6:7], 16 +; SI-NEXT: s_lshr_b32 s38, s25, 16 +; SI-NEXT: s_lshr_b32 s39, s41, 16 +; SI-NEXT: s_lshr_b32 s48, s43, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 ; SI-NEXT: s_lshr_b64 s[44:45], s[8:9], 16 ; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 16 ; SI-NEXT: s_lshr_b64 s[56:57], s[12:13], 16 ; SI-NEXT: s_lshr_b64 s[58:59], s[14:15], 16 -; SI-NEXT: s_lshr_b64 s[60:61], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[62:63], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[72:73], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[20:21], 16 ; SI-NEXT: s_lshr_b64 s[74:75], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[78:79], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[88:89], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[42:43], 16 ; SI-NEXT: .LBB41_3: ; %end -; SI-NEXT: s_lshl_b32 s41, s88, 16 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s16, s16, s41 -; SI-NEXT: v_mov_b32_e32 v1, s16 -; SI-NEXT: s_and_b32 s16, s17, 0xffff -; SI-NEXT: s_lshl_b32 s17, s48, 16 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_lshl_b32 s16, s78, 16 -; SI-NEXT: s_and_b32 s17, s18, 0xffff -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_mov_b32_e32 v3, s16 -; SI-NEXT: s_and_b32 s16, s19, 0xffff -; SI-NEXT: s_lshl_b32 s17, s39, 16 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_mov_b32_e32 v4, s16 -; SI-NEXT: s_lshl_b32 s16, s76, 16 -; SI-NEXT: s_and_b32 s17, s20, 0xffff -; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_lshl_b32 s27, s88, 16 +; SI-NEXT: s_and_b32 s29, s42, 0xffff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: v_mov_b32_e32 v1, s27 +; SI-NEXT: s_and_b32 s27, s43, 0xffff +; SI-NEXT: s_lshl_b32 s29, s48, 16 +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_lshl_b32 s27, s78, 16 +; SI-NEXT: s_and_b32 s29, s40, 0xffff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: v_mov_b32_e32 v3, s27 +; SI-NEXT: s_and_b32 s27, s41, 0xffff +; SI-NEXT: s_lshl_b32 s29, s39, 16 +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_mov_b32_e32 v4, s27 +; SI-NEXT: s_lshl_b32 s27, s76, 16 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_or_b32 s24, s24, s27 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_mov_b32_e32 v5, s16 -; SI-NEXT: s_and_b32 s16, s21, 0xffff +; SI-NEXT: v_mov_b32_e32 v5, s24 +; SI-NEXT: s_and_b32 s24, s25, 0xffff ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: s_lshl_b32 s17, s38, 16 +; SI-NEXT: s_lshl_b32 s25, s38, 16 ; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s24, s24, s25 ; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s22, 0xffff -; SI-NEXT: s_lshl_b32 s17, s74, 16 +; SI-NEXT: v_mov_b32_e32 v2, s24 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_lshl_b32 s24, s74, 16 ; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s22, s22, s24 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s23, 0xffff -; SI-NEXT: s_lshl_b32 s17, s37, 16 +; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: s_and_b32 s22, s23, 0xffff +; SI-NEXT: s_lshl_b32 s23, s37, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s22, s22, s23 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s24, 0xffff -; SI-NEXT: s_lshl_b32 s17, s72, 16 +; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_lshl_b32 s22, s72, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s20, s20, s22 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s25, 0xffff -; SI-NEXT: s_lshl_b32 s17, s36, 16 +; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_and_b32 s20, s21, 0xffff +; SI-NEXT: s_lshl_b32 s21, s36, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s20, s20, s21 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s26, 0xffff -; SI-NEXT: s_lshl_b32 s17, s62, 16 +; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s20, s62, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s18, s18, s20 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s27, 0xffff -; SI-NEXT: s_lshl_b32 s17, s35, 16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_and_b32 s18, s19, 0xffff +; SI-NEXT: s_lshl_b32 s19, s35, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s28, 0xffff -; SI-NEXT: s_lshl_b32 s17, s60, 16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s18, s60, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s16, s16, s18 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s29, 0xffff +; SI-NEXT: s_and_b32 s16, s17, 0xffff ; SI-NEXT: s_lshl_b32 s17, s34, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 ; SI-NEXT: s_or_b32 s16, s16, s17 @@ -24326,7 +24669,7 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_lshl_b32 s8, s42, 16 +; SI-NEXT: s_lshl_b32 s8, s28, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x54, v0 ; SI-NEXT: s_or_b32 s6, s6, s8 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -24340,7 +24683,7 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s6, s40, 16 +; SI-NEXT: s_lshl_b32 s6, s26, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x5c, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -24354,17 +24697,17 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: v_readlane_b32 s48, v14, 8 -; SI-NEXT: v_readlane_b32 s39, v14, 7 -; SI-NEXT: v_readlane_b32 s38, v14, 6 -; SI-NEXT: v_readlane_b32 s37, v14, 5 -; SI-NEXT: v_readlane_b32 s36, v14, 4 -; SI-NEXT: v_readlane_b32 s35, v14, 3 -; SI-NEXT: v_readlane_b32 s34, v14, 2 -; SI-NEXT: v_readlane_b32 s31, v14, 1 -; SI-NEXT: v_readlane_b32 s30, v14, 0 +; SI-NEXT: v_readlane_b32 s48, v20, 8 +; SI-NEXT: v_readlane_b32 s39, v20, 7 +; SI-NEXT: v_readlane_b32 s38, v20, 6 +; SI-NEXT: v_readlane_b32 s37, v20, 5 +; SI-NEXT: v_readlane_b32 s36, v20, 4 +; SI-NEXT: v_readlane_b32 s35, v20, 3 +; SI-NEXT: v_readlane_b32 s34, v20, 2 +; SI-NEXT: v_readlane_b32 s31, v20, 1 +; SI-NEXT: v_readlane_b32 s30, v20, 0 ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -24393,16 +24736,44 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr94 ; SI-NEXT: ; implicit-def: $sgpr93 ; SI-NEXT: ; implicit-def: $sgpr92 -; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: s_branch .LBB41_2 ; ; VI-LABEL: bitcast_v13i64_to_v52i16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, s16 +; VI-NEXT: v_mov_b32_e32 v14, s17 +; VI-NEXT: v_mov_b32_e32 v15, s18 +; VI-NEXT: v_mov_b32_e32 v16, s19 +; VI-NEXT: v_mov_b32_e32 v17, s20 +; VI-NEXT: v_mov_b32_e32 v18, s21 +; VI-NEXT: v_mov_b32_e32 v19, s22 +; VI-NEXT: v_readfirstlane_b32 s44, v13 +; VI-NEXT: v_mov_b32_e32 v13, s23 +; VI-NEXT: v_readfirstlane_b32 s43, v14 +; VI-NEXT: v_mov_b32_e32 v14, s24 +; VI-NEXT: v_readfirstlane_b32 s42, v15 +; VI-NEXT: v_mov_b32_e32 v15, s25 +; VI-NEXT: v_readfirstlane_b32 s41, v16 +; VI-NEXT: v_mov_b32_e32 v16, s26 +; VI-NEXT: v_readfirstlane_b32 s40, v17 +; VI-NEXT: v_mov_b32_e32 v17, s27 +; VI-NEXT: v_readfirstlane_b32 s26, v18 +; VI-NEXT: v_mov_b32_e32 v18, s28 +; VI-NEXT: v_readfirstlane_b32 s25, v19 +; VI-NEXT: v_mov_b32_e32 v19, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; VI-NEXT: v_readfirstlane_b32 s41, v0 -; VI-NEXT: v_readfirstlane_b32 s40, v1 +; VI-NEXT: v_readfirstlane_b32 s24, v13 +; VI-NEXT: v_readfirstlane_b32 s23, v14 +; VI-NEXT: v_readfirstlane_b32 s22, v15 +; VI-NEXT: v_readfirstlane_b32 s21, v16 +; VI-NEXT: v_readfirstlane_b32 s20, v17 +; VI-NEXT: v_readfirstlane_b32 s19, v18 +; VI-NEXT: v_readfirstlane_b32 s18, v19 +; VI-NEXT: v_readfirstlane_b32 s17, v0 +; VI-NEXT: v_readfirstlane_b32 s16, v1 ; VI-NEXT: v_readfirstlane_b32 s15, v2 ; VI-NEXT: v_readfirstlane_b32 s14, v3 ; VI-NEXT: v_readfirstlane_b32 s13, v4 @@ -24416,9 +24787,9 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3 ; VI-NEXT: v_readfirstlane_b32 s7, v11 ; VI-NEXT: s_cbranch_scc0 .LBB41_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s42, s7, 16 -; VI-NEXT: s_lshr_b32 s43, s6, 16 -; VI-NEXT: s_lshr_b32 s44, s8, 16 +; VI-NEXT: s_lshr_b32 s27, s7, 16 +; VI-NEXT: s_lshr_b32 s28, s6, 16 +; VI-NEXT: s_lshr_b32 s29, s8, 16 ; VI-NEXT: s_lshr_b32 s45, s9, 16 ; VI-NEXT: s_lshr_b32 s46, s10, 16 ; VI-NEXT: s_lshr_b32 s47, s11, 16 @@ -24426,22 +24797,22 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3 ; VI-NEXT: s_lshr_b32 s57, s13, 16 ; VI-NEXT: s_lshr_b32 s58, s14, 16 ; VI-NEXT: s_lshr_b32 s59, s15, 16 -; VI-NEXT: s_lshr_b32 s60, s40, 16 -; VI-NEXT: s_lshr_b32 s61, s41, 16 -; VI-NEXT: s_lshr_b32 s62, s29, 16 -; VI-NEXT: s_lshr_b32 s63, s28, 16 -; VI-NEXT: s_lshr_b32 s72, s27, 16 -; VI-NEXT: s_lshr_b32 s73, s26, 16 -; VI-NEXT: s_lshr_b32 s74, s25, 16 -; VI-NEXT: s_lshr_b32 s75, s24, 16 -; VI-NEXT: s_lshr_b32 s76, s23, 16 -; VI-NEXT: s_lshr_b32 s77, s22, 16 -; VI-NEXT: s_lshr_b32 s78, s21, 16 -; VI-NEXT: s_lshr_b32 s79, s20, 16 -; VI-NEXT: s_lshr_b32 s88, s19, 16 -; VI-NEXT: s_lshr_b32 s89, s18, 16 -; VI-NEXT: s_lshr_b32 s90, s17, 16 -; VI-NEXT: s_lshr_b32 s91, s16, 16 +; VI-NEXT: s_lshr_b32 s60, s16, 16 +; VI-NEXT: s_lshr_b32 s61, s17, 16 +; VI-NEXT: s_lshr_b32 s62, s18, 16 +; VI-NEXT: s_lshr_b32 s63, s19, 16 +; VI-NEXT: s_lshr_b32 s72, s20, 16 +; VI-NEXT: s_lshr_b32 s73, s21, 16 +; VI-NEXT: s_lshr_b32 s74, s22, 16 +; VI-NEXT: s_lshr_b32 s75, s23, 16 +; VI-NEXT: s_lshr_b32 s76, s24, 16 +; VI-NEXT: s_lshr_b32 s77, s25, 16 +; VI-NEXT: s_lshr_b32 s78, s26, 16 +; VI-NEXT: s_lshr_b32 s79, s40, 16 +; VI-NEXT: s_lshr_b32 s88, s41, 16 +; VI-NEXT: s_lshr_b32 s89, s42, 16 +; VI-NEXT: s_lshr_b32 s90, s43, 16 +; VI-NEXT: s_lshr_b32 s91, s44, 16 ; VI-NEXT: s_cbranch_execnz .LBB41_3 ; VI-NEXT: .LBB41_2: ; %cmp.true ; VI-NEXT: s_add_u32 s6, s6, 3 @@ -24454,25 +24825,25 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3 ; VI-NEXT: s_addc_u32 s12, s12, 0 ; VI-NEXT: s_add_u32 s15, s15, 3 ; VI-NEXT: s_addc_u32 s14, s14, 0 -; VI-NEXT: s_add_u32 s41, s41, 3 -; VI-NEXT: s_addc_u32 s40, s40, 0 -; VI-NEXT: s_add_u32 s28, s28, 3 -; VI-NEXT: s_addc_u32 s29, s29, 0 -; VI-NEXT: s_add_u32 s26, s26, 3 -; VI-NEXT: s_addc_u32 s27, s27, 0 -; VI-NEXT: s_add_u32 s24, s24, 3 -; VI-NEXT: s_addc_u32 s25, s25, 0 -; VI-NEXT: s_add_u32 s22, s22, 3 -; VI-NEXT: s_addc_u32 s23, s23, 0 -; VI-NEXT: s_add_u32 s20, s20, 3 -; VI-NEXT: s_addc_u32 s21, s21, 0 -; VI-NEXT: s_add_u32 s18, s18, 3 -; VI-NEXT: s_addc_u32 s19, s19, 0 -; VI-NEXT: s_add_u32 s16, s16, 3 -; VI-NEXT: s_addc_u32 s17, s17, 0 -; VI-NEXT: s_lshr_b32 s42, s7, 16 -; VI-NEXT: s_lshr_b32 s43, s6, 16 -; VI-NEXT: s_lshr_b32 s44, s8, 16 +; VI-NEXT: s_add_u32 s17, s17, 3 +; VI-NEXT: s_addc_u32 s16, s16, 0 +; VI-NEXT: s_add_u32 s19, s19, 3 +; VI-NEXT: s_addc_u32 s18, s18, 0 +; VI-NEXT: s_add_u32 s21, s21, 3 +; VI-NEXT: s_addc_u32 s20, s20, 0 +; VI-NEXT: s_add_u32 s23, s23, 3 +; VI-NEXT: s_addc_u32 s22, s22, 0 +; VI-NEXT: s_add_u32 s25, s25, 3 +; VI-NEXT: s_addc_u32 s24, s24, 0 +; VI-NEXT: s_add_u32 s40, s40, 3 +; VI-NEXT: s_addc_u32 s26, s26, 0 +; VI-NEXT: s_add_u32 s42, s42, 3 +; VI-NEXT: s_addc_u32 s41, s41, 0 +; VI-NEXT: s_add_u32 s44, s44, 3 +; VI-NEXT: s_addc_u32 s43, s43, 0 +; VI-NEXT: s_lshr_b32 s27, s7, 16 +; VI-NEXT: s_lshr_b32 s28, s6, 16 +; VI-NEXT: s_lshr_b32 s29, s8, 16 ; VI-NEXT: s_lshr_b32 s45, s9, 16 ; VI-NEXT: s_lshr_b32 s46, s10, 16 ; VI-NEXT: s_lshr_b32 s47, s11, 16 @@ -24480,117 +24851,117 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3 ; VI-NEXT: s_lshr_b32 s57, s13, 16 ; VI-NEXT: s_lshr_b32 s58, s14, 16 ; VI-NEXT: s_lshr_b32 s59, s15, 16 -; VI-NEXT: s_lshr_b32 s60, s40, 16 -; VI-NEXT: s_lshr_b32 s61, s41, 16 -; VI-NEXT: s_lshr_b32 s62, s29, 16 -; VI-NEXT: s_lshr_b32 s63, s28, 16 -; VI-NEXT: s_lshr_b32 s72, s27, 16 -; VI-NEXT: s_lshr_b32 s73, s26, 16 -; VI-NEXT: s_lshr_b32 s74, s25, 16 -; VI-NEXT: s_lshr_b32 s75, s24, 16 -; VI-NEXT: s_lshr_b32 s76, s23, 16 -; VI-NEXT: s_lshr_b32 s77, s22, 16 -; VI-NEXT: s_lshr_b32 s78, s21, 16 -; VI-NEXT: s_lshr_b32 s79, s20, 16 -; VI-NEXT: s_lshr_b32 s88, s19, 16 -; VI-NEXT: s_lshr_b32 s89, s18, 16 -; VI-NEXT: s_lshr_b32 s90, s17, 16 -; VI-NEXT: s_lshr_b32 s91, s16, 16 +; VI-NEXT: s_lshr_b32 s60, s16, 16 +; VI-NEXT: s_lshr_b32 s61, s17, 16 +; VI-NEXT: s_lshr_b32 s62, s18, 16 +; VI-NEXT: s_lshr_b32 s63, s19, 16 +; VI-NEXT: s_lshr_b32 s72, s20, 16 +; VI-NEXT: s_lshr_b32 s73, s21, 16 +; VI-NEXT: s_lshr_b32 s74, s22, 16 +; VI-NEXT: s_lshr_b32 s75, s23, 16 +; VI-NEXT: s_lshr_b32 s76, s24, 16 +; VI-NEXT: s_lshr_b32 s77, s25, 16 +; VI-NEXT: s_lshr_b32 s78, s26, 16 +; VI-NEXT: s_lshr_b32 s79, s40, 16 +; VI-NEXT: s_lshr_b32 s88, s41, 16 +; VI-NEXT: s_lshr_b32 s89, s42, 16 +; VI-NEXT: s_lshr_b32 s90, s43, 16 +; VI-NEXT: s_lshr_b32 s91, s44, 16 ; VI-NEXT: .LBB41_3: ; %end -; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_and_b32 s4, 0xffff, s44 ; VI-NEXT: s_lshl_b32 s5, s91, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s16, s90, 16 -; VI-NEXT: s_or_b32 s5, s5, s16 -; VI-NEXT: s_and_b32 s16, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s17, s89, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_and_b32 s17, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s18, s88, 16 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s18, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s19, s79, 16 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_and_b32 s19, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s20, s78, 16 -; VI-NEXT: s_or_b32 s19, s19, s20 -; VI-NEXT: s_and_b32 s20, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s21, s77, 16 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_and_b32 s21, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s22, s76, 16 -; VI-NEXT: s_or_b32 s21, s21, s22 -; VI-NEXT: s_and_b32 s22, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s23, s75, 16 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_and_b32 s23, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s24, s74, 16 -; VI-NEXT: s_or_b32 s23, s23, s24 -; VI-NEXT: s_and_b32 s24, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s25, s73, 16 -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_and_b32 s25, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s26, s72, 16 -; VI-NEXT: s_or_b32 s25, s25, s26 -; VI-NEXT: s_and_b32 s26, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s27, s63, 16 -; VI-NEXT: s_or_b32 s26, s26, s27 -; VI-NEXT: s_and_b32 s27, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s28, s62, 16 -; VI-NEXT: s_or_b32 s27, s27, s28 -; VI-NEXT: s_and_b32 s28, 0xffff, s41 -; VI-NEXT: s_lshl_b32 s29, s61, 16 -; VI-NEXT: s_or_b32 s28, s28, s29 -; VI-NEXT: s_and_b32 s29, 0xffff, s40 -; VI-NEXT: s_lshl_b32 s40, s60, 16 -; VI-NEXT: s_or_b32 s29, s29, s40 +; VI-NEXT: s_and_b32 s5, 0xffff, s43 +; VI-NEXT: s_lshl_b32 s43, s90, 16 +; VI-NEXT: s_or_b32 s5, s5, s43 +; VI-NEXT: s_and_b32 s42, 0xffff, s42 +; VI-NEXT: s_lshl_b32 s43, s89, 16 +; VI-NEXT: s_or_b32 s42, s42, s43 +; VI-NEXT: s_and_b32 s41, 0xffff, s41 +; VI-NEXT: s_lshl_b32 s43, s88, 16 +; VI-NEXT: s_or_b32 s41, s41, s43 +; VI-NEXT: s_and_b32 s40, 0xffff, s40 +; VI-NEXT: s_lshl_b32 s43, s79, 16 +; VI-NEXT: s_or_b32 s40, s40, s43 +; VI-NEXT: s_and_b32 s26, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s43, s78, 16 +; VI-NEXT: s_or_b32 s26, s26, s43 +; VI-NEXT: s_and_b32 s25, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s43, s77, 16 +; VI-NEXT: s_or_b32 s25, s25, s43 +; VI-NEXT: s_and_b32 s24, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s43, s76, 16 +; VI-NEXT: s_or_b32 s24, s24, s43 +; VI-NEXT: s_and_b32 s23, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s43, s75, 16 +; VI-NEXT: s_or_b32 s23, s23, s43 +; VI-NEXT: s_and_b32 s22, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s43, s74, 16 +; VI-NEXT: s_or_b32 s22, s22, s43 +; VI-NEXT: s_and_b32 s21, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s43, s73, 16 +; VI-NEXT: s_or_b32 s21, s21, s43 +; VI-NEXT: s_and_b32 s20, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s43, s72, 16 +; VI-NEXT: s_or_b32 s20, s20, s43 +; VI-NEXT: s_and_b32 s19, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s43, s63, 16 +; VI-NEXT: s_or_b32 s19, s19, s43 +; VI-NEXT: s_and_b32 s18, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s43, s62, 16 +; VI-NEXT: s_or_b32 s18, s18, s43 +; VI-NEXT: s_and_b32 s17, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s43, s61, 16 +; VI-NEXT: s_or_b32 s17, s17, s43 +; VI-NEXT: s_and_b32 s16, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s43, s60, 16 +; VI-NEXT: s_or_b32 s16, s16, s43 ; VI-NEXT: s_and_b32 s15, 0xffff, s15 -; VI-NEXT: s_lshl_b32 s40, s59, 16 -; VI-NEXT: s_or_b32 s15, s15, s40 +; VI-NEXT: s_lshl_b32 s43, s59, 16 +; VI-NEXT: s_or_b32 s15, s15, s43 ; VI-NEXT: s_and_b32 s14, 0xffff, s14 -; VI-NEXT: s_lshl_b32 s40, s58, 16 -; VI-NEXT: s_or_b32 s14, s14, s40 +; VI-NEXT: s_lshl_b32 s43, s58, 16 +; VI-NEXT: s_or_b32 s14, s14, s43 ; VI-NEXT: s_and_b32 s13, 0xffff, s13 -; VI-NEXT: s_lshl_b32 s40, s57, 16 -; VI-NEXT: s_or_b32 s13, s13, s40 +; VI-NEXT: s_lshl_b32 s43, s57, 16 +; VI-NEXT: s_or_b32 s13, s13, s43 ; VI-NEXT: s_and_b32 s12, 0xffff, s12 -; VI-NEXT: s_lshl_b32 s40, s56, 16 -; VI-NEXT: s_or_b32 s12, s12, s40 +; VI-NEXT: s_lshl_b32 s43, s56, 16 +; VI-NEXT: s_or_b32 s12, s12, s43 ; VI-NEXT: s_and_b32 s11, 0xffff, s11 -; VI-NEXT: s_lshl_b32 s40, s47, 16 -; VI-NEXT: s_or_b32 s11, s11, s40 +; VI-NEXT: s_lshl_b32 s43, s47, 16 +; VI-NEXT: s_or_b32 s11, s11, s43 ; VI-NEXT: s_and_b32 s10, 0xffff, s10 -; VI-NEXT: s_lshl_b32 s40, s46, 16 -; VI-NEXT: s_or_b32 s10, s10, s40 +; VI-NEXT: s_lshl_b32 s43, s46, 16 +; VI-NEXT: s_or_b32 s10, s10, s43 ; VI-NEXT: s_and_b32 s9, 0xffff, s9 -; VI-NEXT: s_lshl_b32 s40, s45, 16 -; VI-NEXT: s_or_b32 s9, s9, s40 +; VI-NEXT: s_lshl_b32 s43, s45, 16 ; VI-NEXT: s_and_b32 s8, 0xffff, s8 -; VI-NEXT: s_lshl_b32 s40, s44, 16 -; VI-NEXT: s_or_b32 s8, s8, s40 +; VI-NEXT: s_lshl_b32 s29, s29, 16 ; VI-NEXT: s_and_b32 s6, 0xffff, s6 -; VI-NEXT: s_lshl_b32 s40, s43, 16 -; VI-NEXT: s_or_b32 s6, s6, s40 +; VI-NEXT: s_lshl_b32 s28, s28, 16 ; VI-NEXT: s_and_b32 s7, 0xffff, s7 -; VI-NEXT: s_lshl_b32 s40, s42, 16 -; VI-NEXT: s_or_b32 s7, s7, s40 +; VI-NEXT: s_lshl_b32 s27, s27, 16 +; VI-NEXT: s_or_b32 s9, s9, s43 +; VI-NEXT: s_or_b32 s8, s8, s29 +; VI-NEXT: s_or_b32 s6, s6, s28 +; VI-NEXT: s_or_b32 s7, s7, s27 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: v_mov_b32_e32 v3, s17 -; VI-NEXT: v_mov_b32_e32 v4, s18 -; VI-NEXT: v_mov_b32_e32 v5, s19 -; VI-NEXT: v_mov_b32_e32 v6, s20 -; VI-NEXT: v_mov_b32_e32 v7, s21 -; VI-NEXT: v_mov_b32_e32 v8, s22 -; VI-NEXT: v_mov_b32_e32 v9, s23 -; VI-NEXT: v_mov_b32_e32 v10, s24 -; VI-NEXT: v_mov_b32_e32 v11, s25 -; VI-NEXT: v_mov_b32_e32 v12, s26 -; VI-NEXT: v_mov_b32_e32 v13, s27 -; VI-NEXT: v_mov_b32_e32 v14, s28 -; VI-NEXT: v_mov_b32_e32 v15, s29 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_mov_b32_e32 v3, s41 +; VI-NEXT: v_mov_b32_e32 v4, s40 +; VI-NEXT: v_mov_b32_e32 v5, s26 +; VI-NEXT: v_mov_b32_e32 v6, s25 +; VI-NEXT: v_mov_b32_e32 v7, s24 +; VI-NEXT: v_mov_b32_e32 v8, s23 +; VI-NEXT: v_mov_b32_e32 v9, s22 +; VI-NEXT: v_mov_b32_e32 v10, s21 +; VI-NEXT: v_mov_b32_e32 v11, s20 +; VI-NEXT: v_mov_b32_e32 v12, s19 +; VI-NEXT: v_mov_b32_e32 v13, s18 +; VI-NEXT: v_mov_b32_e32 v14, s17 +; VI-NEXT: v_mov_b32_e32 v15, s16 ; VI-NEXT: v_mov_b32_e32 v16, s15 ; VI-NEXT: v_mov_b32_e32 v17, s14 ; VI-NEXT: v_mov_b32_e32 v18, s13 @@ -24626,25 +24997,53 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3 ; VI-NEXT: ; implicit-def: $sgpr47 ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; implicit-def: $sgpr45 -; VI-NEXT: ; implicit-def: $sgpr44 -; VI-NEXT: ; implicit-def: $sgpr43 -; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 ; VI-NEXT: s_branch .LBB41_2 ; ; GFX9-LABEL: bitcast_v13i64_to_v52i16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, s16 +; GFX9-NEXT: v_mov_b32_e32 v14, s17 +; GFX9-NEXT: v_mov_b32_e32 v15, s18 +; GFX9-NEXT: v_mov_b32_e32 v16, s19 +; GFX9-NEXT: v_mov_b32_e32 v17, s20 +; GFX9-NEXT: v_mov_b32_e32 v18, s21 +; GFX9-NEXT: v_mov_b32_e32 v19, s22 +; GFX9-NEXT: v_readfirstlane_b32 s6, v13 +; GFX9-NEXT: v_mov_b32_e32 v13, s23 +; GFX9-NEXT: v_readfirstlane_b32 s7, v14 +; GFX9-NEXT: v_mov_b32_e32 v14, s24 +; GFX9-NEXT: v_readfirstlane_b32 s8, v15 +; GFX9-NEXT: v_mov_b32_e32 v15, s25 +; GFX9-NEXT: v_readfirstlane_b32 s9, v16 +; GFX9-NEXT: v_mov_b32_e32 v16, s26 +; GFX9-NEXT: v_readfirstlane_b32 s10, v17 +; GFX9-NEXT: v_mov_b32_e32 v17, s27 +; GFX9-NEXT: v_readfirstlane_b32 s11, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, s28 +; GFX9-NEXT: v_readfirstlane_b32 s12, v19 +; GFX9-NEXT: v_mov_b32_e32 v19, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: v_readfirstlane_b32 s7, v1 -; GFX9-NEXT: v_readfirstlane_b32 s8, v2 -; GFX9-NEXT: v_readfirstlane_b32 s9, v3 -; GFX9-NEXT: v_readfirstlane_b32 s10, v4 -; GFX9-NEXT: v_readfirstlane_b32 s11, v5 -; GFX9-NEXT: v_readfirstlane_b32 s12, v6 -; GFX9-NEXT: v_readfirstlane_b32 s13, v7 -; GFX9-NEXT: v_readfirstlane_b32 s14, v8 -; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s13, v13 +; GFX9-NEXT: v_readfirstlane_b32 s14, v14 +; GFX9-NEXT: v_readfirstlane_b32 s15, v15 +; GFX9-NEXT: v_readfirstlane_b32 s16, v16 +; GFX9-NEXT: v_readfirstlane_b32 s17, v17 +; GFX9-NEXT: v_readfirstlane_b32 s18, v18 +; GFX9-NEXT: v_readfirstlane_b32 s19, v19 +; GFX9-NEXT: v_readfirstlane_b32 s20, v0 +; GFX9-NEXT: v_readfirstlane_b32 s21, v1 +; GFX9-NEXT: v_readfirstlane_b32 s22, v2 +; GFX9-NEXT: v_readfirstlane_b32 s23, v3 +; GFX9-NEXT: v_readfirstlane_b32 s24, v4 +; GFX9-NEXT: v_readfirstlane_b32 s25, v5 +; GFX9-NEXT: v_readfirstlane_b32 s26, v6 +; GFX9-NEXT: v_readfirstlane_b32 s27, v7 +; GFX9-NEXT: v_readfirstlane_b32 s28, v8 +; GFX9-NEXT: v_readfirstlane_b32 s29, v9 ; GFX9-NEXT: v_readfirstlane_b32 s40, v10 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_readfirstlane_b32 s41, v11 @@ -24652,44 +25051,34 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_lshr_b32 s42, s41, 16 ; GFX9-NEXT: s_lshr_b32 s43, s40, 16 -; GFX9-NEXT: s_lshr_b32 s44, s15, 16 -; GFX9-NEXT: s_lshr_b32 s45, s14, 16 -; GFX9-NEXT: s_lshr_b32 s46, s13, 16 -; GFX9-NEXT: s_lshr_b32 s47, s12, 16 -; GFX9-NEXT: s_lshr_b32 s56, s11, 16 -; GFX9-NEXT: s_lshr_b32 s57, s10, 16 -; GFX9-NEXT: s_lshr_b32 s58, s9, 16 -; GFX9-NEXT: s_lshr_b32 s59, s8, 16 -; GFX9-NEXT: s_lshr_b32 s60, s7, 16 -; GFX9-NEXT: s_lshr_b32 s61, s6, 16 -; GFX9-NEXT: s_lshr_b32 s62, s29, 16 -; GFX9-NEXT: s_lshr_b32 s63, s28, 16 -; GFX9-NEXT: s_lshr_b32 s72, s27, 16 -; GFX9-NEXT: s_lshr_b32 s73, s26, 16 -; GFX9-NEXT: s_lshr_b32 s74, s25, 16 -; GFX9-NEXT: s_lshr_b32 s75, s24, 16 -; GFX9-NEXT: s_lshr_b32 s76, s23, 16 -; GFX9-NEXT: s_lshr_b32 s77, s22, 16 -; GFX9-NEXT: s_lshr_b32 s78, s21, 16 -; GFX9-NEXT: s_lshr_b32 s79, s20, 16 -; GFX9-NEXT: s_lshr_b32 s88, s19, 16 -; GFX9-NEXT: s_lshr_b32 s89, s18, 16 -; GFX9-NEXT: s_lshr_b32 s90, s17, 16 -; GFX9-NEXT: s_lshr_b32 s91, s16, 16 +; GFX9-NEXT: s_lshr_b32 s44, s29, 16 +; GFX9-NEXT: s_lshr_b32 s45, s28, 16 +; GFX9-NEXT: s_lshr_b32 s46, s27, 16 +; GFX9-NEXT: s_lshr_b32 s47, s26, 16 +; GFX9-NEXT: s_lshr_b32 s56, s25, 16 +; GFX9-NEXT: s_lshr_b32 s57, s24, 16 +; GFX9-NEXT: s_lshr_b32 s58, s23, 16 +; GFX9-NEXT: s_lshr_b32 s59, s22, 16 +; GFX9-NEXT: s_lshr_b32 s60, s21, 16 +; GFX9-NEXT: s_lshr_b32 s61, s20, 16 +; GFX9-NEXT: s_lshr_b32 s62, s19, 16 +; GFX9-NEXT: s_lshr_b32 s63, s18, 16 +; GFX9-NEXT: s_lshr_b32 s72, s17, 16 +; GFX9-NEXT: s_lshr_b32 s73, s16, 16 +; GFX9-NEXT: s_lshr_b32 s74, s15, 16 +; GFX9-NEXT: s_lshr_b32 s75, s14, 16 +; GFX9-NEXT: s_lshr_b32 s76, s13, 16 +; GFX9-NEXT: s_lshr_b32 s77, s12, 16 +; GFX9-NEXT: s_lshr_b32 s78, s11, 16 +; GFX9-NEXT: s_lshr_b32 s79, s10, 16 +; GFX9-NEXT: s_lshr_b32 s88, s9, 16 +; GFX9-NEXT: s_lshr_b32 s89, s8, 16 +; GFX9-NEXT: s_lshr_b32 s90, s7, 16 +; GFX9-NEXT: s_lshr_b32 s91, s6, 16 ; GFX9-NEXT: s_cbranch_execnz .LBB41_3 ; GFX9-NEXT: .LBB41_2: ; %cmp.true ; GFX9-NEXT: s_add_u32 s40, s40, 3 ; GFX9-NEXT: s_addc_u32 s41, s41, 0 -; GFX9-NEXT: s_add_u32 s14, s14, 3 -; GFX9-NEXT: s_addc_u32 s15, s15, 0 -; GFX9-NEXT: s_add_u32 s12, s12, 3 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-NEXT: s_add_u32 s10, s10, 3 -; GFX9-NEXT: s_addc_u32 s11, s11, 0 -; GFX9-NEXT: s_add_u32 s8, s8, 3 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-NEXT: s_add_u32 s6, s6, 3 -; GFX9-NEXT: s_addc_u32 s7, s7, 0 ; GFX9-NEXT: s_add_u32 s28, s28, 3 ; GFX9-NEXT: s_addc_u32 s29, s29, 0 ; GFX9-NEXT: s_add_u32 s26, s26, 3 @@ -24704,83 +25093,93 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3 ; GFX9-NEXT: s_addc_u32 s19, s19, 0 ; GFX9-NEXT: s_add_u32 s16, s16, 3 ; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_add_u32 s14, s14, 3 +; GFX9-NEXT: s_addc_u32 s15, s15, 0 +; GFX9-NEXT: s_add_u32 s12, s12, 3 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_add_u32 s10, s10, 3 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 ; GFX9-NEXT: s_lshr_b32 s42, s41, 16 ; GFX9-NEXT: s_lshr_b32 s43, s40, 16 -; GFX9-NEXT: s_lshr_b32 s44, s15, 16 -; GFX9-NEXT: s_lshr_b32 s45, s14, 16 -; GFX9-NEXT: s_lshr_b32 s46, s13, 16 -; GFX9-NEXT: s_lshr_b32 s47, s12, 16 -; GFX9-NEXT: s_lshr_b32 s56, s11, 16 -; GFX9-NEXT: s_lshr_b32 s57, s10, 16 -; GFX9-NEXT: s_lshr_b32 s58, s9, 16 -; GFX9-NEXT: s_lshr_b32 s59, s8, 16 -; GFX9-NEXT: s_lshr_b32 s60, s7, 16 -; GFX9-NEXT: s_lshr_b32 s61, s6, 16 -; GFX9-NEXT: s_lshr_b32 s62, s29, 16 -; GFX9-NEXT: s_lshr_b32 s63, s28, 16 -; GFX9-NEXT: s_lshr_b32 s72, s27, 16 -; GFX9-NEXT: s_lshr_b32 s73, s26, 16 -; GFX9-NEXT: s_lshr_b32 s74, s25, 16 -; GFX9-NEXT: s_lshr_b32 s75, s24, 16 -; GFX9-NEXT: s_lshr_b32 s76, s23, 16 -; GFX9-NEXT: s_lshr_b32 s77, s22, 16 -; GFX9-NEXT: s_lshr_b32 s78, s21, 16 -; GFX9-NEXT: s_lshr_b32 s79, s20, 16 -; GFX9-NEXT: s_lshr_b32 s88, s19, 16 -; GFX9-NEXT: s_lshr_b32 s89, s18, 16 -; GFX9-NEXT: s_lshr_b32 s90, s17, 16 -; GFX9-NEXT: s_lshr_b32 s91, s16, 16 +; GFX9-NEXT: s_lshr_b32 s44, s29, 16 +; GFX9-NEXT: s_lshr_b32 s45, s28, 16 +; GFX9-NEXT: s_lshr_b32 s46, s27, 16 +; GFX9-NEXT: s_lshr_b32 s47, s26, 16 +; GFX9-NEXT: s_lshr_b32 s56, s25, 16 +; GFX9-NEXT: s_lshr_b32 s57, s24, 16 +; GFX9-NEXT: s_lshr_b32 s58, s23, 16 +; GFX9-NEXT: s_lshr_b32 s59, s22, 16 +; GFX9-NEXT: s_lshr_b32 s60, s21, 16 +; GFX9-NEXT: s_lshr_b32 s61, s20, 16 +; GFX9-NEXT: s_lshr_b32 s62, s19, 16 +; GFX9-NEXT: s_lshr_b32 s63, s18, 16 +; GFX9-NEXT: s_lshr_b32 s72, s17, 16 +; GFX9-NEXT: s_lshr_b32 s73, s16, 16 +; GFX9-NEXT: s_lshr_b32 s74, s15, 16 +; GFX9-NEXT: s_lshr_b32 s75, s14, 16 +; GFX9-NEXT: s_lshr_b32 s76, s13, 16 +; GFX9-NEXT: s_lshr_b32 s77, s12, 16 +; GFX9-NEXT: s_lshr_b32 s78, s11, 16 +; GFX9-NEXT: s_lshr_b32 s79, s10, 16 +; GFX9-NEXT: s_lshr_b32 s88, s9, 16 +; GFX9-NEXT: s_lshr_b32 s89, s8, 16 +; GFX9-NEXT: s_lshr_b32 s90, s7, 16 +; GFX9-NEXT: s_lshr_b32 s91, s6, 16 ; GFX9-NEXT: .LBB41_3: ; %end -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s91 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s90 -; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s89 -; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s88 -; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s79 -; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s78 -; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s77 -; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s76 -; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s75 -; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s74 -; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s73 -; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s72 -; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s63 -; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s62 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s61 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s60 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s59 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s58 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s57 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s56 -; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s47 -; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s46 -; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s45 -; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s91 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s7, s90 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s89 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s9, s88 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s10, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s11, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s12, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s13, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s14, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s15, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s16, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s17, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s44 ; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s43 ; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s42 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s18 -; GFX9-NEXT: v_mov_b32_e32 v5, s19 -; GFX9-NEXT: v_mov_b32_e32 v6, s20 -; GFX9-NEXT: v_mov_b32_e32 v7, s21 -; GFX9-NEXT: v_mov_b32_e32 v8, s22 -; GFX9-NEXT: v_mov_b32_e32 v9, s23 -; GFX9-NEXT: v_mov_b32_e32 v10, s24 -; GFX9-NEXT: v_mov_b32_e32 v11, s25 -; GFX9-NEXT: v_mov_b32_e32 v12, s26 -; GFX9-NEXT: v_mov_b32_e32 v13, s27 -; GFX9-NEXT: v_mov_b32_e32 v14, s6 -; GFX9-NEXT: v_mov_b32_e32 v15, s7 -; GFX9-NEXT: v_mov_b32_e32 v16, s8 -; GFX9-NEXT: v_mov_b32_e32 v17, s9 -; GFX9-NEXT: v_mov_b32_e32 v18, s10 -; GFX9-NEXT: v_mov_b32_e32 v19, s11 -; GFX9-NEXT: v_mov_b32_e32 v20, s12 -; GFX9-NEXT: v_mov_b32_e32 v21, s13 -; GFX9-NEXT: v_mov_b32_e32 v22, s14 -; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-NEXT: v_mov_b32_e32 v12, s16 +; GFX9-NEXT: v_mov_b32_e32 v13, s17 +; GFX9-NEXT: v_mov_b32_e32 v14, s18 +; GFX9-NEXT: v_mov_b32_e32 v15, s19 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v18, s22 +; GFX9-NEXT: v_mov_b32_e32 v19, s23 +; GFX9-NEXT: v_mov_b32_e32 v20, s24 +; GFX9-NEXT: v_mov_b32_e32 v21, s25 +; GFX9-NEXT: v_mov_b32_e32 v22, s26 +; GFX9-NEXT: v_mov_b32_e32 v23, s27 ; GFX9-NEXT: v_mov_b32_e32 v24, s28 ; GFX9-NEXT: v_mov_b32_e32 v25, s29 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -24816,41 +25215,68 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3 ; GFX11-LABEL: bitcast_v13i64_to_v52i16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 +; GFX11-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 +; GFX11-NEXT: v_dual_mov_b32 v13, s16 :: v_dual_mov_b32 v14, s17 +; GFX11-NEXT: v_dual_mov_b32 v15, s18 :: v_dual_mov_b32 v16, s19 +; GFX11-NEXT: v_dual_mov_b32 v17, s20 :: v_dual_mov_b32 v18, s21 +; GFX11-NEXT: v_dual_mov_b32 v19, s22 :: v_dual_mov_b32 v20, s23 +; GFX11-NEXT: v_dual_mov_b32 v21, s24 :: v_dual_mov_b32 v22, s25 +; GFX11-NEXT: v_dual_mov_b32 v23, s26 :: v_dual_mov_b32 v24, s27 +; GFX11-NEXT: v_dual_mov_b32 v25, s28 :: v_dual_mov_b32 v26, s29 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: v_readfirstlane_b32 s8, v4 -; GFX11-NEXT: v_readfirstlane_b32 s9, v5 -; GFX11-NEXT: v_readfirstlane_b32 s11, v6 -; GFX11-NEXT: v_readfirstlane_b32 s10, v7 +; GFX11-NEXT: v_readfirstlane_b32 s0, v9 +; GFX11-NEXT: v_readfirstlane_b32 s1, v10 +; GFX11-NEXT: v_readfirstlane_b32 s2, v11 +; GFX11-NEXT: v_readfirstlane_b32 s3, v12 +; GFX11-NEXT: v_readfirstlane_b32 s4, v13 +; GFX11-NEXT: v_readfirstlane_b32 s5, v14 +; GFX11-NEXT: v_readfirstlane_b32 s6, v15 +; GFX11-NEXT: v_readfirstlane_b32 s7, v16 +; GFX11-NEXT: v_readfirstlane_b32 s8, v17 +; GFX11-NEXT: v_readfirstlane_b32 s9, v18 +; GFX11-NEXT: v_readfirstlane_b32 s10, v19 +; GFX11-NEXT: v_readfirstlane_b32 s11, v20 +; GFX11-NEXT: v_readfirstlane_b32 s12, v21 +; GFX11-NEXT: v_readfirstlane_b32 s13, v22 +; GFX11-NEXT: v_readfirstlane_b32 s14, v23 +; GFX11-NEXT: v_readfirstlane_b32 s15, v24 +; GFX11-NEXT: v_readfirstlane_b32 s16, v25 +; GFX11-NEXT: v_readfirstlane_b32 s17, v26 +; GFX11-NEXT: v_readfirstlane_b32 s18, v0 +; GFX11-NEXT: v_readfirstlane_b32 s19, v1 +; GFX11-NEXT: v_readfirstlane_b32 s20, v2 +; GFX11-NEXT: v_readfirstlane_b32 s21, v3 +; GFX11-NEXT: v_readfirstlane_b32 s22, v4 +; GFX11-NEXT: v_readfirstlane_b32 s23, v5 +; GFX11-NEXT: v_readfirstlane_b32 s25, v6 +; GFX11-NEXT: v_readfirstlane_b32 s24, v7 ; GFX11-NEXT: s_mov_b32 s78, 0 -; GFX11-NEXT: s_and_b32 s12, vcc_lo, exec_lo +; GFX11-NEXT: s_and_b32 s26, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB41_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: s_lshr_b32 s12, s10, 16 -; GFX11-NEXT: s_lshr_b32 s13, s11, 16 -; GFX11-NEXT: s_lshr_b32 s14, s9, 16 -; GFX11-NEXT: s_lshr_b32 s15, s8, 16 -; GFX11-NEXT: s_lshr_b32 s40, s7, 16 -; GFX11-NEXT: s_lshr_b32 s41, s6, 16 -; GFX11-NEXT: s_lshr_b32 s42, s5, 16 -; GFX11-NEXT: s_lshr_b32 s43, s4, 16 -; GFX11-NEXT: s_lshr_b32 s44, s29, 16 -; GFX11-NEXT: s_lshr_b32 s45, s28, 16 -; GFX11-NEXT: s_lshr_b32 s46, s27, 16 -; GFX11-NEXT: s_lshr_b32 s47, s26, 16 -; GFX11-NEXT: s_lshr_b32 s56, s25, 16 -; GFX11-NEXT: s_lshr_b32 s57, s24, 16 -; GFX11-NEXT: s_lshr_b32 s58, s23, 16 -; GFX11-NEXT: s_lshr_b32 s59, s22, 16 -; GFX11-NEXT: s_lshr_b32 s60, s21, 16 -; GFX11-NEXT: s_lshr_b32 s61, s20, 16 -; GFX11-NEXT: s_lshr_b32 s62, s19, 16 -; GFX11-NEXT: s_lshr_b32 s63, s18, 16 -; GFX11-NEXT: s_lshr_b32 s72, s17, 16 -; GFX11-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-NEXT: s_lshr_b32 s26, s24, 16 +; GFX11-NEXT: s_lshr_b32 s27, s25, 16 +; GFX11-NEXT: s_lshr_b32 s28, s23, 16 +; GFX11-NEXT: s_lshr_b32 s29, s22, 16 +; GFX11-NEXT: s_lshr_b32 s40, s21, 16 +; GFX11-NEXT: s_lshr_b32 s41, s20, 16 +; GFX11-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-NEXT: s_lshr_b32 s44, s17, 16 +; GFX11-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-NEXT: s_lshr_b32 s46, s15, 16 +; GFX11-NEXT: s_lshr_b32 s47, s14, 16 +; GFX11-NEXT: s_lshr_b32 s56, s13, 16 +; GFX11-NEXT: s_lshr_b32 s57, s12, 16 +; GFX11-NEXT: s_lshr_b32 s58, s11, 16 +; GFX11-NEXT: s_lshr_b32 s59, s10, 16 +; GFX11-NEXT: s_lshr_b32 s60, s9, 16 +; GFX11-NEXT: s_lshr_b32 s61, s8, 16 +; GFX11-NEXT: s_lshr_b32 s62, s7, 16 +; GFX11-NEXT: s_lshr_b32 s63, s6, 16 +; GFX11-NEXT: s_lshr_b32 s72, s5, 16 +; GFX11-NEXT: s_lshr_b32 s73, s4, 16 ; GFX11-NEXT: s_lshr_b32 s74, s3, 16 ; GFX11-NEXT: s_lshr_b32 s75, s2, 16 ; GFX11-NEXT: s_lshr_b32 s76, s1, 16 @@ -24858,20 +25284,8 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s78 ; GFX11-NEXT: s_cbranch_vccnz .LBB41_3 ; GFX11-NEXT: .LBB41_2: ; %cmp.true -; GFX11-NEXT: s_add_u32 s11, s11, 3 -; GFX11-NEXT: s_addc_u32 s10, s10, 0 -; GFX11-NEXT: s_add_u32 s8, s8, 3 -; GFX11-NEXT: s_addc_u32 s9, s9, 0 -; GFX11-NEXT: s_add_u32 s6, s6, 3 -; GFX11-NEXT: s_addc_u32 s7, s7, 0 -; GFX11-NEXT: s_add_u32 s4, s4, 3 -; GFX11-NEXT: s_addc_u32 s5, s5, 0 -; GFX11-NEXT: s_add_u32 s28, s28, 3 -; GFX11-NEXT: s_addc_u32 s29, s29, 0 -; GFX11-NEXT: s_add_u32 s26, s26, 3 -; GFX11-NEXT: s_addc_u32 s27, s27, 0 -; GFX11-NEXT: s_add_u32 s24, s24, 3 -; GFX11-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-NEXT: s_add_u32 s25, s25, 3 +; GFX11-NEXT: s_addc_u32 s24, s24, 0 ; GFX11-NEXT: s_add_u32 s22, s22, 3 ; GFX11-NEXT: s_addc_u32 s23, s23, 0 ; GFX11-NEXT: s_add_u32 s20, s20, 3 @@ -24880,32 +25294,44 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3 ; GFX11-NEXT: s_addc_u32 s19, s19, 0 ; GFX11-NEXT: s_add_u32 s16, s16, 3 ; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s14, s14, 3 +; GFX11-NEXT: s_addc_u32 s15, s15, 0 +; GFX11-NEXT: s_add_u32 s12, s12, 3 +; GFX11-NEXT: s_addc_u32 s13, s13, 0 +; GFX11-NEXT: s_add_u32 s10, s10, 3 +; GFX11-NEXT: s_addc_u32 s11, s11, 0 +; GFX11-NEXT: s_add_u32 s8, s8, 3 +; GFX11-NEXT: s_addc_u32 s9, s9, 0 +; GFX11-NEXT: s_add_u32 s6, s6, 3 +; GFX11-NEXT: s_addc_u32 s7, s7, 0 +; GFX11-NEXT: s_add_u32 s4, s4, 3 +; GFX11-NEXT: s_addc_u32 s5, s5, 0 ; GFX11-NEXT: s_add_u32 s2, s2, 3 ; GFX11-NEXT: s_addc_u32 s3, s3, 0 ; GFX11-NEXT: s_add_u32 s0, s0, 3 ; GFX11-NEXT: s_addc_u32 s1, s1, 0 -; GFX11-NEXT: s_lshr_b32 s12, s10, 16 -; GFX11-NEXT: s_lshr_b32 s13, s11, 16 -; GFX11-NEXT: s_lshr_b32 s14, s9, 16 -; GFX11-NEXT: s_lshr_b32 s15, s8, 16 -; GFX11-NEXT: s_lshr_b32 s40, s7, 16 -; GFX11-NEXT: s_lshr_b32 s41, s6, 16 -; GFX11-NEXT: s_lshr_b32 s42, s5, 16 -; GFX11-NEXT: s_lshr_b32 s43, s4, 16 -; GFX11-NEXT: s_lshr_b32 s44, s29, 16 -; GFX11-NEXT: s_lshr_b32 s45, s28, 16 -; GFX11-NEXT: s_lshr_b32 s46, s27, 16 -; GFX11-NEXT: s_lshr_b32 s47, s26, 16 -; GFX11-NEXT: s_lshr_b32 s56, s25, 16 -; GFX11-NEXT: s_lshr_b32 s57, s24, 16 -; GFX11-NEXT: s_lshr_b32 s58, s23, 16 -; GFX11-NEXT: s_lshr_b32 s59, s22, 16 -; GFX11-NEXT: s_lshr_b32 s60, s21, 16 -; GFX11-NEXT: s_lshr_b32 s61, s20, 16 -; GFX11-NEXT: s_lshr_b32 s62, s19, 16 -; GFX11-NEXT: s_lshr_b32 s63, s18, 16 -; GFX11-NEXT: s_lshr_b32 s72, s17, 16 -; GFX11-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-NEXT: s_lshr_b32 s26, s24, 16 +; GFX11-NEXT: s_lshr_b32 s27, s25, 16 +; GFX11-NEXT: s_lshr_b32 s28, s23, 16 +; GFX11-NEXT: s_lshr_b32 s29, s22, 16 +; GFX11-NEXT: s_lshr_b32 s40, s21, 16 +; GFX11-NEXT: s_lshr_b32 s41, s20, 16 +; GFX11-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-NEXT: s_lshr_b32 s44, s17, 16 +; GFX11-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-NEXT: s_lshr_b32 s46, s15, 16 +; GFX11-NEXT: s_lshr_b32 s47, s14, 16 +; GFX11-NEXT: s_lshr_b32 s56, s13, 16 +; GFX11-NEXT: s_lshr_b32 s57, s12, 16 +; GFX11-NEXT: s_lshr_b32 s58, s11, 16 +; GFX11-NEXT: s_lshr_b32 s59, s10, 16 +; GFX11-NEXT: s_lshr_b32 s60, s9, 16 +; GFX11-NEXT: s_lshr_b32 s61, s8, 16 +; GFX11-NEXT: s_lshr_b32 s62, s7, 16 +; GFX11-NEXT: s_lshr_b32 s63, s6, 16 +; GFX11-NEXT: s_lshr_b32 s72, s5, 16 +; GFX11-NEXT: s_lshr_b32 s73, s4, 16 ; GFX11-NEXT: s_lshr_b32 s74, s3, 16 ; GFX11-NEXT: s_lshr_b32 s75, s2, 16 ; GFX11-NEXT: s_lshr_b32 s76, s1, 16 @@ -24916,41 +25342,41 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3 ; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s76 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s75 ; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s74 -; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s73 -; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s72 -; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s63 -; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s62 -; GFX11-NEXT: s_pack_ll_b32_b16 s20, s20, s61 -; GFX11-NEXT: s_pack_ll_b32_b16 s21, s21, s60 -; GFX11-NEXT: s_pack_ll_b32_b16 s22, s22, s59 -; GFX11-NEXT: s_pack_ll_b32_b16 s23, s23, s58 -; GFX11-NEXT: s_pack_ll_b32_b16 s24, s24, s57 -; GFX11-NEXT: s_pack_ll_b32_b16 s25, s25, s56 -; GFX11-NEXT: s_pack_ll_b32_b16 s26, s26, s47 -; GFX11-NEXT: s_pack_ll_b32_b16 s27, s27, s46 -; GFX11-NEXT: s_pack_ll_b32_b16 s28, s28, s45 -; GFX11-NEXT: s_pack_ll_b32_b16 s29, s29, s44 -; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s43 -; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s42 -; GFX11-NEXT: s_pack_ll_b32_b16 s6, s6, s41 -; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s40 -; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s15 -; GFX11-NEXT: s_pack_ll_b32_b16 s9, s9, s14 -; GFX11-NEXT: s_pack_ll_b32_b16 s11, s11, s13 -; GFX11-NEXT: s_pack_ll_b32_b16 s10, s10, s12 +; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s73 +; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s72 +; GFX11-NEXT: s_pack_ll_b32_b16 s6, s6, s63 +; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s62 +; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s61 +; GFX11-NEXT: s_pack_ll_b32_b16 s9, s9, s60 +; GFX11-NEXT: s_pack_ll_b32_b16 s10, s10, s59 +; GFX11-NEXT: s_pack_ll_b32_b16 s11, s11, s58 +; GFX11-NEXT: s_pack_ll_b32_b16 s12, s12, s57 +; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s56 +; GFX11-NEXT: s_pack_ll_b32_b16 s14, s14, s47 +; GFX11-NEXT: s_pack_ll_b32_b16 s15, s15, s46 +; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s45 +; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s44 +; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s43 +; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s42 +; GFX11-NEXT: s_pack_ll_b32_b16 s20, s20, s41 +; GFX11-NEXT: s_pack_ll_b32_b16 s21, s21, s40 +; GFX11-NEXT: s_pack_ll_b32_b16 s22, s22, s29 +; GFX11-NEXT: s_pack_ll_b32_b16 s23, s23, s28 +; GFX11-NEXT: s_pack_ll_b32_b16 s25, s25, s27 +; GFX11-NEXT: s_pack_ll_b32_b16 s24, s24, s26 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 -; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 -; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 -; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 -; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 -; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 -; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 -; GFX11-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 -; GFX11-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 -; GFX11-NEXT: v_dual_mov_b32 v22, s8 :: v_dual_mov_b32 v23, s9 -; GFX11-NEXT: v_dual_mov_b32 v24, s11 :: v_dual_mov_b32 v25, s10 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GFX11-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v21, s21 +; GFX11-NEXT: v_dual_mov_b32 v22, s22 :: v_dual_mov_b32 v23, s23 +; GFX11-NEXT: v_dual_mov_b32 v24, s25 :: v_dual_mov_b32 v25, s24 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB41_4: ; GFX11-NEXT: ; implicit-def: $sgpr77 @@ -24975,10 +25401,10 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr41 ; GFX11-NEXT: ; implicit-def: $sgpr40 -; GFX11-NEXT: ; implicit-def: $sgpr15 -; GFX11-NEXT: ; implicit-def: $sgpr14 -; GFX11-NEXT: ; implicit-def: $sgpr13 -; GFX11-NEXT: ; implicit-def: $sgpr12 +; GFX11-NEXT: ; implicit-def: $sgpr29 +; GFX11-NEXT: ; implicit-def: $sgpr28 +; GFX11-NEXT: ; implicit-def: $sgpr27 +; GFX11-NEXT: ; implicit-def: $sgpr26 ; GFX11-NEXT: s_branch .LBB41_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -25421,7 +25847,7 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 ; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 ; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v24 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v25 ; SI-NEXT: .LBB42_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] @@ -28502,9 +28928,37 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i ; SI-LABEL: bitcast_v13i64_to_v52f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, s16 +; SI-NEXT: v_mov_b32_e32 v15, s17 +; SI-NEXT: v_mov_b32_e32 v16, s18 +; SI-NEXT: v_mov_b32_e32 v17, s19 +; SI-NEXT: v_mov_b32_e32 v18, s20 +; SI-NEXT: v_mov_b32_e32 v19, s21 +; SI-NEXT: v_readfirstlane_b32 s40, v14 +; SI-NEXT: v_mov_b32_e32 v14, s22 +; SI-NEXT: v_readfirstlane_b32 s41, v15 +; SI-NEXT: v_mov_b32_e32 v15, s23 +; SI-NEXT: v_readfirstlane_b32 s22, v16 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_readfirstlane_b32 s42, v17 +; SI-NEXT: v_mov_b32_e32 v17, s25 +; SI-NEXT: v_readfirstlane_b32 s23, v18 +; SI-NEXT: v_mov_b32_e32 v18, s26 +; SI-NEXT: v_readfirstlane_b32 s43, v19 +; SI-NEXT: v_mov_b32_e32 v19, s27 +; SI-NEXT: v_readfirstlane_b32 s24, v14 +; SI-NEXT: v_mov_b32_e32 v14, s28 +; SI-NEXT: v_readfirstlane_b32 s27, v15 +; SI-NEXT: v_mov_b32_e32 v15, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; SI-NEXT: v_readfirstlane_b32 s40, v1 -; SI-NEXT: v_readfirstlane_b32 s41, v2 +; SI-NEXT: v_readfirstlane_b32 s25, v16 +; SI-NEXT: v_readfirstlane_b32 s26, v17 +; SI-NEXT: v_readfirstlane_b32 s20, v18 +; SI-NEXT: v_readfirstlane_b32 s21, v19 +; SI-NEXT: v_readfirstlane_b32 s18, v14 +; SI-NEXT: v_readfirstlane_b32 s19, v15 +; SI-NEXT: v_readfirstlane_b32 s16, v1 +; SI-NEXT: v_readfirstlane_b32 s17, v2 ; SI-NEXT: v_readfirstlane_b32 s14, v3 ; SI-NEXT: v_readfirstlane_b32 s15, v4 ; SI-NEXT: v_readfirstlane_b32 s12, v5 @@ -28543,39 +28997,39 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 ; SI-NEXT: s_lshr_b32 s4, s14, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s41, 16 +; SI-NEXT: s_lshr_b32 s4, s17, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s40, 16 +; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s4, s19, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: s_lshr_b32 s4, s18, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: s_lshr_b32 s4, s21, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: s_lshr_b32 s4, s20, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: s_lshr_b32 s4, s26, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: s_lshr_b32 s4, s25, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: s_lshr_b32 s4, s27, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: s_lshr_b32 s4, s24, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: s_lshr_b32 s4, s43, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: s_lshr_b32 s4, s23, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: s_lshr_b32 s4, s42, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: s_lshr_b32 s4, s22, 16 ; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: s_lshr_b32 s4, s41, 16 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_lshr_b32 s4, s40, 16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 @@ -28588,56 +29042,56 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s40 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s16, 3 -; SI-NEXT: s_addc_u32 s5, s17, 0 -; SI-NEXT: s_lshr_b32 s16, s4, 16 -; SI-NEXT: s_lshr_b32 s17, s5, 16 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: s_lshr_b32 s42, s18, 16 -; SI-NEXT: s_lshr_b32 s43, s19, 16 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_lshr_b32 s44, s20, 16 -; SI-NEXT: s_lshr_b32 s45, s21, 16 +; SI-NEXT: s_add_u32 s4, s40, 3 +; SI-NEXT: s_addc_u32 s5, s41, 0 +; SI-NEXT: s_lshr_b32 s28, s4, 16 +; SI-NEXT: s_lshr_b32 s29, s5, 16 ; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_lshr_b32 s46, s22, 16 -; SI-NEXT: s_lshr_b32 s47, s23, 16 +; SI-NEXT: s_addc_u32 s40, s42, 0 +; SI-NEXT: s_lshr_b32 s41, s22, 16 +; SI-NEXT: s_lshr_b32 s42, s40, 16 +; SI-NEXT: s_add_u32 s23, s23, 3 +; SI-NEXT: s_addc_u32 s43, s43, 0 +; SI-NEXT: s_lshr_b32 s44, s23, 16 +; SI-NEXT: s_lshr_b32 s45, s43, 16 ; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s25, s25, 0 -; SI-NEXT: s_lshr_b32 s56, s24, 16 -; SI-NEXT: s_lshr_b32 s57, s25, 16 -; SI-NEXT: s_add_u32 s26, s26, 3 ; SI-NEXT: s_addc_u32 s27, s27, 0 -; SI-NEXT: s_lshr_b32 s58, s26, 16 -; SI-NEXT: s_lshr_b32 s59, s27, 16 -; SI-NEXT: s_add_u32 s28, s28, 3 -; SI-NEXT: s_addc_u32 s29, s29, 0 -; SI-NEXT: s_lshr_b32 s60, s28, 16 -; SI-NEXT: s_lshr_b32 s61, s29, 16 -; SI-NEXT: s_add_u32 s40, s40, 3 -; SI-NEXT: s_addc_u32 s41, s41, 0 -; SI-NEXT: s_lshr_b32 s62, s40, 16 -; SI-NEXT: s_lshr_b32 s63, s41, 16 +; SI-NEXT: s_lshr_b32 s46, s24, 16 +; SI-NEXT: s_lshr_b32 s47, s27, 16 +; SI-NEXT: s_add_u32 s25, s25, 3 +; SI-NEXT: s_addc_u32 s26, s26, 0 +; SI-NEXT: s_lshr_b32 s56, s25, 16 +; SI-NEXT: s_lshr_b32 s57, s26, 16 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_lshr_b32 s58, s20, 16 +; SI-NEXT: s_lshr_b32 s59, s21, 16 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_lshr_b32 s60, s18, 16 +; SI-NEXT: s_lshr_b32 s61, s19, 16 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_lshr_b32 s62, s16, 16 +; SI-NEXT: s_lshr_b32 s63, s17, 16 ; SI-NEXT: s_add_u32 s14, s14, 3 ; SI-NEXT: s_addc_u32 s15, s15, 0 ; SI-NEXT: s_lshr_b32 s72, s14, 16 @@ -28668,20 +29122,20 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s22 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v41, s5 ; SI-NEXT: s_waitcnt expcnt(1) @@ -28708,11 +29162,11 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v39, s46 ; SI-NEXT: v_cvt_f32_f16_e32 v49, s45 ; SI-NEXT: v_cvt_f32_f16_e32 v51, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s29 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v44, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s28 ; SI-NEXT: .LBB45_3: ; %end ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 @@ -28960,9 +29414,37 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i ; VI-LABEL: bitcast_v13i64_to_v52f16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, s16 +; VI-NEXT: v_mov_b32_e32 v14, s17 +; VI-NEXT: v_mov_b32_e32 v15, s18 +; VI-NEXT: v_mov_b32_e32 v16, s19 +; VI-NEXT: v_mov_b32_e32 v17, s20 +; VI-NEXT: v_mov_b32_e32 v18, s21 +; VI-NEXT: v_mov_b32_e32 v19, s22 +; VI-NEXT: v_readfirstlane_b32 s44, v13 +; VI-NEXT: v_mov_b32_e32 v13, s23 +; VI-NEXT: v_readfirstlane_b32 s43, v14 +; VI-NEXT: v_mov_b32_e32 v14, s24 +; VI-NEXT: v_readfirstlane_b32 s42, v15 +; VI-NEXT: v_mov_b32_e32 v15, s25 +; VI-NEXT: v_readfirstlane_b32 s41, v16 +; VI-NEXT: v_mov_b32_e32 v16, s26 +; VI-NEXT: v_readfirstlane_b32 s40, v17 +; VI-NEXT: v_mov_b32_e32 v17, s27 +; VI-NEXT: v_readfirstlane_b32 s26, v18 +; VI-NEXT: v_mov_b32_e32 v18, s28 +; VI-NEXT: v_readfirstlane_b32 s25, v19 +; VI-NEXT: v_mov_b32_e32 v19, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; VI-NEXT: v_readfirstlane_b32 s41, v0 -; VI-NEXT: v_readfirstlane_b32 s40, v1 +; VI-NEXT: v_readfirstlane_b32 s24, v13 +; VI-NEXT: v_readfirstlane_b32 s23, v14 +; VI-NEXT: v_readfirstlane_b32 s22, v15 +; VI-NEXT: v_readfirstlane_b32 s21, v16 +; VI-NEXT: v_readfirstlane_b32 s20, v17 +; VI-NEXT: v_readfirstlane_b32 s19, v18 +; VI-NEXT: v_readfirstlane_b32 s18, v19 +; VI-NEXT: v_readfirstlane_b32 s17, v0 +; VI-NEXT: v_readfirstlane_b32 s16, v1 ; VI-NEXT: v_readfirstlane_b32 s15, v2 ; VI-NEXT: v_readfirstlane_b32 s14, v3 ; VI-NEXT: v_readfirstlane_b32 s13, v4 @@ -28976,9 +29458,9 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i ; VI-NEXT: v_readfirstlane_b32 s7, v11 ; VI-NEXT: s_cbranch_scc0 .LBB45_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s42, s7, 16 -; VI-NEXT: s_lshr_b32 s43, s6, 16 -; VI-NEXT: s_lshr_b32 s44, s8, 16 +; VI-NEXT: s_lshr_b32 s27, s7, 16 +; VI-NEXT: s_lshr_b32 s28, s6, 16 +; VI-NEXT: s_lshr_b32 s29, s8, 16 ; VI-NEXT: s_lshr_b32 s45, s9, 16 ; VI-NEXT: s_lshr_b32 s46, s10, 16 ; VI-NEXT: s_lshr_b32 s47, s11, 16 @@ -28986,22 +29468,22 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i ; VI-NEXT: s_lshr_b32 s57, s13, 16 ; VI-NEXT: s_lshr_b32 s58, s14, 16 ; VI-NEXT: s_lshr_b32 s59, s15, 16 -; VI-NEXT: s_lshr_b32 s60, s40, 16 -; VI-NEXT: s_lshr_b32 s61, s41, 16 -; VI-NEXT: s_lshr_b32 s62, s29, 16 -; VI-NEXT: s_lshr_b32 s63, s28, 16 -; VI-NEXT: s_lshr_b32 s72, s27, 16 -; VI-NEXT: s_lshr_b32 s73, s26, 16 -; VI-NEXT: s_lshr_b32 s74, s25, 16 -; VI-NEXT: s_lshr_b32 s75, s24, 16 -; VI-NEXT: s_lshr_b32 s76, s23, 16 -; VI-NEXT: s_lshr_b32 s77, s22, 16 -; VI-NEXT: s_lshr_b32 s78, s21, 16 -; VI-NEXT: s_lshr_b32 s79, s20, 16 -; VI-NEXT: s_lshr_b32 s88, s19, 16 -; VI-NEXT: s_lshr_b32 s89, s18, 16 -; VI-NEXT: s_lshr_b32 s90, s17, 16 -; VI-NEXT: s_lshr_b32 s91, s16, 16 +; VI-NEXT: s_lshr_b32 s60, s16, 16 +; VI-NEXT: s_lshr_b32 s61, s17, 16 +; VI-NEXT: s_lshr_b32 s62, s18, 16 +; VI-NEXT: s_lshr_b32 s63, s19, 16 +; VI-NEXT: s_lshr_b32 s72, s20, 16 +; VI-NEXT: s_lshr_b32 s73, s21, 16 +; VI-NEXT: s_lshr_b32 s74, s22, 16 +; VI-NEXT: s_lshr_b32 s75, s23, 16 +; VI-NEXT: s_lshr_b32 s76, s24, 16 +; VI-NEXT: s_lshr_b32 s77, s25, 16 +; VI-NEXT: s_lshr_b32 s78, s26, 16 +; VI-NEXT: s_lshr_b32 s79, s40, 16 +; VI-NEXT: s_lshr_b32 s88, s41, 16 +; VI-NEXT: s_lshr_b32 s89, s42, 16 +; VI-NEXT: s_lshr_b32 s90, s43, 16 +; VI-NEXT: s_lshr_b32 s91, s44, 16 ; VI-NEXT: s_cbranch_execnz .LBB45_3 ; VI-NEXT: .LBB45_2: ; %cmp.true ; VI-NEXT: s_add_u32 s6, s6, 3 @@ -29014,25 +29496,25 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i ; VI-NEXT: s_addc_u32 s12, s12, 0 ; VI-NEXT: s_add_u32 s15, s15, 3 ; VI-NEXT: s_addc_u32 s14, s14, 0 -; VI-NEXT: s_add_u32 s41, s41, 3 -; VI-NEXT: s_addc_u32 s40, s40, 0 -; VI-NEXT: s_add_u32 s28, s28, 3 -; VI-NEXT: s_addc_u32 s29, s29, 0 -; VI-NEXT: s_add_u32 s26, s26, 3 -; VI-NEXT: s_addc_u32 s27, s27, 0 -; VI-NEXT: s_add_u32 s24, s24, 3 -; VI-NEXT: s_addc_u32 s25, s25, 0 -; VI-NEXT: s_add_u32 s22, s22, 3 -; VI-NEXT: s_addc_u32 s23, s23, 0 -; VI-NEXT: s_add_u32 s20, s20, 3 -; VI-NEXT: s_addc_u32 s21, s21, 0 -; VI-NEXT: s_add_u32 s18, s18, 3 -; VI-NEXT: s_addc_u32 s19, s19, 0 -; VI-NEXT: s_add_u32 s16, s16, 3 -; VI-NEXT: s_addc_u32 s17, s17, 0 -; VI-NEXT: s_lshr_b32 s42, s7, 16 -; VI-NEXT: s_lshr_b32 s43, s6, 16 -; VI-NEXT: s_lshr_b32 s44, s8, 16 +; VI-NEXT: s_add_u32 s17, s17, 3 +; VI-NEXT: s_addc_u32 s16, s16, 0 +; VI-NEXT: s_add_u32 s19, s19, 3 +; VI-NEXT: s_addc_u32 s18, s18, 0 +; VI-NEXT: s_add_u32 s21, s21, 3 +; VI-NEXT: s_addc_u32 s20, s20, 0 +; VI-NEXT: s_add_u32 s23, s23, 3 +; VI-NEXT: s_addc_u32 s22, s22, 0 +; VI-NEXT: s_add_u32 s25, s25, 3 +; VI-NEXT: s_addc_u32 s24, s24, 0 +; VI-NEXT: s_add_u32 s40, s40, 3 +; VI-NEXT: s_addc_u32 s26, s26, 0 +; VI-NEXT: s_add_u32 s42, s42, 3 +; VI-NEXT: s_addc_u32 s41, s41, 0 +; VI-NEXT: s_add_u32 s44, s44, 3 +; VI-NEXT: s_addc_u32 s43, s43, 0 +; VI-NEXT: s_lshr_b32 s27, s7, 16 +; VI-NEXT: s_lshr_b32 s28, s6, 16 +; VI-NEXT: s_lshr_b32 s29, s8, 16 ; VI-NEXT: s_lshr_b32 s45, s9, 16 ; VI-NEXT: s_lshr_b32 s46, s10, 16 ; VI-NEXT: s_lshr_b32 s47, s11, 16 @@ -29040,117 +29522,117 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i ; VI-NEXT: s_lshr_b32 s57, s13, 16 ; VI-NEXT: s_lshr_b32 s58, s14, 16 ; VI-NEXT: s_lshr_b32 s59, s15, 16 -; VI-NEXT: s_lshr_b32 s60, s40, 16 -; VI-NEXT: s_lshr_b32 s61, s41, 16 -; VI-NEXT: s_lshr_b32 s62, s29, 16 -; VI-NEXT: s_lshr_b32 s63, s28, 16 -; VI-NEXT: s_lshr_b32 s72, s27, 16 -; VI-NEXT: s_lshr_b32 s73, s26, 16 -; VI-NEXT: s_lshr_b32 s74, s25, 16 -; VI-NEXT: s_lshr_b32 s75, s24, 16 -; VI-NEXT: s_lshr_b32 s76, s23, 16 -; VI-NEXT: s_lshr_b32 s77, s22, 16 -; VI-NEXT: s_lshr_b32 s78, s21, 16 -; VI-NEXT: s_lshr_b32 s79, s20, 16 -; VI-NEXT: s_lshr_b32 s88, s19, 16 -; VI-NEXT: s_lshr_b32 s89, s18, 16 -; VI-NEXT: s_lshr_b32 s90, s17, 16 -; VI-NEXT: s_lshr_b32 s91, s16, 16 +; VI-NEXT: s_lshr_b32 s60, s16, 16 +; VI-NEXT: s_lshr_b32 s61, s17, 16 +; VI-NEXT: s_lshr_b32 s62, s18, 16 +; VI-NEXT: s_lshr_b32 s63, s19, 16 +; VI-NEXT: s_lshr_b32 s72, s20, 16 +; VI-NEXT: s_lshr_b32 s73, s21, 16 +; VI-NEXT: s_lshr_b32 s74, s22, 16 +; VI-NEXT: s_lshr_b32 s75, s23, 16 +; VI-NEXT: s_lshr_b32 s76, s24, 16 +; VI-NEXT: s_lshr_b32 s77, s25, 16 +; VI-NEXT: s_lshr_b32 s78, s26, 16 +; VI-NEXT: s_lshr_b32 s79, s40, 16 +; VI-NEXT: s_lshr_b32 s88, s41, 16 +; VI-NEXT: s_lshr_b32 s89, s42, 16 +; VI-NEXT: s_lshr_b32 s90, s43, 16 +; VI-NEXT: s_lshr_b32 s91, s44, 16 ; VI-NEXT: .LBB45_3: ; %end -; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_and_b32 s4, 0xffff, s44 ; VI-NEXT: s_lshl_b32 s5, s91, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s16, s90, 16 -; VI-NEXT: s_or_b32 s5, s5, s16 -; VI-NEXT: s_and_b32 s16, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s17, s89, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_and_b32 s17, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s18, s88, 16 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s18, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s19, s79, 16 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_and_b32 s19, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s20, s78, 16 -; VI-NEXT: s_or_b32 s19, s19, s20 -; VI-NEXT: s_and_b32 s20, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s21, s77, 16 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_and_b32 s21, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s22, s76, 16 -; VI-NEXT: s_or_b32 s21, s21, s22 -; VI-NEXT: s_and_b32 s22, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s23, s75, 16 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_and_b32 s23, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s24, s74, 16 -; VI-NEXT: s_or_b32 s23, s23, s24 -; VI-NEXT: s_and_b32 s24, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s25, s73, 16 -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_and_b32 s25, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s26, s72, 16 -; VI-NEXT: s_or_b32 s25, s25, s26 -; VI-NEXT: s_and_b32 s26, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s27, s63, 16 -; VI-NEXT: s_or_b32 s26, s26, s27 -; VI-NEXT: s_and_b32 s27, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s28, s62, 16 -; VI-NEXT: s_or_b32 s27, s27, s28 -; VI-NEXT: s_and_b32 s28, 0xffff, s41 -; VI-NEXT: s_lshl_b32 s29, s61, 16 -; VI-NEXT: s_or_b32 s28, s28, s29 -; VI-NEXT: s_and_b32 s29, 0xffff, s40 -; VI-NEXT: s_lshl_b32 s40, s60, 16 -; VI-NEXT: s_or_b32 s29, s29, s40 +; VI-NEXT: s_and_b32 s5, 0xffff, s43 +; VI-NEXT: s_lshl_b32 s43, s90, 16 +; VI-NEXT: s_or_b32 s5, s5, s43 +; VI-NEXT: s_and_b32 s42, 0xffff, s42 +; VI-NEXT: s_lshl_b32 s43, s89, 16 +; VI-NEXT: s_or_b32 s42, s42, s43 +; VI-NEXT: s_and_b32 s41, 0xffff, s41 +; VI-NEXT: s_lshl_b32 s43, s88, 16 +; VI-NEXT: s_or_b32 s41, s41, s43 +; VI-NEXT: s_and_b32 s40, 0xffff, s40 +; VI-NEXT: s_lshl_b32 s43, s79, 16 +; VI-NEXT: s_or_b32 s40, s40, s43 +; VI-NEXT: s_and_b32 s26, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s43, s78, 16 +; VI-NEXT: s_or_b32 s26, s26, s43 +; VI-NEXT: s_and_b32 s25, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s43, s77, 16 +; VI-NEXT: s_or_b32 s25, s25, s43 +; VI-NEXT: s_and_b32 s24, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s43, s76, 16 +; VI-NEXT: s_or_b32 s24, s24, s43 +; VI-NEXT: s_and_b32 s23, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s43, s75, 16 +; VI-NEXT: s_or_b32 s23, s23, s43 +; VI-NEXT: s_and_b32 s22, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s43, s74, 16 +; VI-NEXT: s_or_b32 s22, s22, s43 +; VI-NEXT: s_and_b32 s21, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s43, s73, 16 +; VI-NEXT: s_or_b32 s21, s21, s43 +; VI-NEXT: s_and_b32 s20, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s43, s72, 16 +; VI-NEXT: s_or_b32 s20, s20, s43 +; VI-NEXT: s_and_b32 s19, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s43, s63, 16 +; VI-NEXT: s_or_b32 s19, s19, s43 +; VI-NEXT: s_and_b32 s18, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s43, s62, 16 +; VI-NEXT: s_or_b32 s18, s18, s43 +; VI-NEXT: s_and_b32 s17, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s43, s61, 16 +; VI-NEXT: s_or_b32 s17, s17, s43 +; VI-NEXT: s_and_b32 s16, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s43, s60, 16 +; VI-NEXT: s_or_b32 s16, s16, s43 ; VI-NEXT: s_and_b32 s15, 0xffff, s15 -; VI-NEXT: s_lshl_b32 s40, s59, 16 -; VI-NEXT: s_or_b32 s15, s15, s40 +; VI-NEXT: s_lshl_b32 s43, s59, 16 +; VI-NEXT: s_or_b32 s15, s15, s43 ; VI-NEXT: s_and_b32 s14, 0xffff, s14 -; VI-NEXT: s_lshl_b32 s40, s58, 16 -; VI-NEXT: s_or_b32 s14, s14, s40 +; VI-NEXT: s_lshl_b32 s43, s58, 16 +; VI-NEXT: s_or_b32 s14, s14, s43 ; VI-NEXT: s_and_b32 s13, 0xffff, s13 -; VI-NEXT: s_lshl_b32 s40, s57, 16 -; VI-NEXT: s_or_b32 s13, s13, s40 +; VI-NEXT: s_lshl_b32 s43, s57, 16 +; VI-NEXT: s_or_b32 s13, s13, s43 ; VI-NEXT: s_and_b32 s12, 0xffff, s12 -; VI-NEXT: s_lshl_b32 s40, s56, 16 -; VI-NEXT: s_or_b32 s12, s12, s40 +; VI-NEXT: s_lshl_b32 s43, s56, 16 +; VI-NEXT: s_or_b32 s12, s12, s43 ; VI-NEXT: s_and_b32 s11, 0xffff, s11 -; VI-NEXT: s_lshl_b32 s40, s47, 16 -; VI-NEXT: s_or_b32 s11, s11, s40 +; VI-NEXT: s_lshl_b32 s43, s47, 16 +; VI-NEXT: s_or_b32 s11, s11, s43 ; VI-NEXT: s_and_b32 s10, 0xffff, s10 -; VI-NEXT: s_lshl_b32 s40, s46, 16 -; VI-NEXT: s_or_b32 s10, s10, s40 +; VI-NEXT: s_lshl_b32 s43, s46, 16 +; VI-NEXT: s_or_b32 s10, s10, s43 ; VI-NEXT: s_and_b32 s9, 0xffff, s9 -; VI-NEXT: s_lshl_b32 s40, s45, 16 -; VI-NEXT: s_or_b32 s9, s9, s40 +; VI-NEXT: s_lshl_b32 s43, s45, 16 ; VI-NEXT: s_and_b32 s8, 0xffff, s8 -; VI-NEXT: s_lshl_b32 s40, s44, 16 -; VI-NEXT: s_or_b32 s8, s8, s40 +; VI-NEXT: s_lshl_b32 s29, s29, 16 ; VI-NEXT: s_and_b32 s6, 0xffff, s6 -; VI-NEXT: s_lshl_b32 s40, s43, 16 -; VI-NEXT: s_or_b32 s6, s6, s40 +; VI-NEXT: s_lshl_b32 s28, s28, 16 ; VI-NEXT: s_and_b32 s7, 0xffff, s7 -; VI-NEXT: s_lshl_b32 s40, s42, 16 -; VI-NEXT: s_or_b32 s7, s7, s40 +; VI-NEXT: s_lshl_b32 s27, s27, 16 +; VI-NEXT: s_or_b32 s9, s9, s43 +; VI-NEXT: s_or_b32 s8, s8, s29 +; VI-NEXT: s_or_b32 s6, s6, s28 +; VI-NEXT: s_or_b32 s7, s7, s27 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: v_mov_b32_e32 v3, s17 -; VI-NEXT: v_mov_b32_e32 v4, s18 -; VI-NEXT: v_mov_b32_e32 v5, s19 -; VI-NEXT: v_mov_b32_e32 v6, s20 -; VI-NEXT: v_mov_b32_e32 v7, s21 -; VI-NEXT: v_mov_b32_e32 v8, s22 -; VI-NEXT: v_mov_b32_e32 v9, s23 -; VI-NEXT: v_mov_b32_e32 v10, s24 -; VI-NEXT: v_mov_b32_e32 v11, s25 -; VI-NEXT: v_mov_b32_e32 v12, s26 -; VI-NEXT: v_mov_b32_e32 v13, s27 -; VI-NEXT: v_mov_b32_e32 v14, s28 -; VI-NEXT: v_mov_b32_e32 v15, s29 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_mov_b32_e32 v3, s41 +; VI-NEXT: v_mov_b32_e32 v4, s40 +; VI-NEXT: v_mov_b32_e32 v5, s26 +; VI-NEXT: v_mov_b32_e32 v6, s25 +; VI-NEXT: v_mov_b32_e32 v7, s24 +; VI-NEXT: v_mov_b32_e32 v8, s23 +; VI-NEXT: v_mov_b32_e32 v9, s22 +; VI-NEXT: v_mov_b32_e32 v10, s21 +; VI-NEXT: v_mov_b32_e32 v11, s20 +; VI-NEXT: v_mov_b32_e32 v12, s19 +; VI-NEXT: v_mov_b32_e32 v13, s18 +; VI-NEXT: v_mov_b32_e32 v14, s17 +; VI-NEXT: v_mov_b32_e32 v15, s16 ; VI-NEXT: v_mov_b32_e32 v16, s15 ; VI-NEXT: v_mov_b32_e32 v17, s14 ; VI-NEXT: v_mov_b32_e32 v18, s13 @@ -29186,25 +29668,53 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i ; VI-NEXT: ; implicit-def: $sgpr47 ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; implicit-def: $sgpr45 -; VI-NEXT: ; implicit-def: $sgpr44 -; VI-NEXT: ; implicit-def: $sgpr43 -; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 ; VI-NEXT: s_branch .LBB45_2 ; ; GFX9-LABEL: bitcast_v13i64_to_v52f16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, s16 +; GFX9-NEXT: v_mov_b32_e32 v14, s17 +; GFX9-NEXT: v_mov_b32_e32 v15, s18 +; GFX9-NEXT: v_mov_b32_e32 v16, s19 +; GFX9-NEXT: v_mov_b32_e32 v17, s20 +; GFX9-NEXT: v_mov_b32_e32 v18, s21 +; GFX9-NEXT: v_mov_b32_e32 v19, s22 +; GFX9-NEXT: v_readfirstlane_b32 s6, v13 +; GFX9-NEXT: v_mov_b32_e32 v13, s23 +; GFX9-NEXT: v_readfirstlane_b32 s7, v14 +; GFX9-NEXT: v_mov_b32_e32 v14, s24 +; GFX9-NEXT: v_readfirstlane_b32 s8, v15 +; GFX9-NEXT: v_mov_b32_e32 v15, s25 +; GFX9-NEXT: v_readfirstlane_b32 s9, v16 +; GFX9-NEXT: v_mov_b32_e32 v16, s26 +; GFX9-NEXT: v_readfirstlane_b32 s10, v17 +; GFX9-NEXT: v_mov_b32_e32 v17, s27 +; GFX9-NEXT: v_readfirstlane_b32 s11, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, s28 +; GFX9-NEXT: v_readfirstlane_b32 s12, v19 +; GFX9-NEXT: v_mov_b32_e32 v19, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: v_readfirstlane_b32 s7, v1 -; GFX9-NEXT: v_readfirstlane_b32 s8, v2 -; GFX9-NEXT: v_readfirstlane_b32 s9, v3 -; GFX9-NEXT: v_readfirstlane_b32 s10, v4 -; GFX9-NEXT: v_readfirstlane_b32 s11, v5 -; GFX9-NEXT: v_readfirstlane_b32 s12, v6 -; GFX9-NEXT: v_readfirstlane_b32 s13, v7 -; GFX9-NEXT: v_readfirstlane_b32 s14, v8 -; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s13, v13 +; GFX9-NEXT: v_readfirstlane_b32 s14, v14 +; GFX9-NEXT: v_readfirstlane_b32 s15, v15 +; GFX9-NEXT: v_readfirstlane_b32 s16, v16 +; GFX9-NEXT: v_readfirstlane_b32 s17, v17 +; GFX9-NEXT: v_readfirstlane_b32 s18, v18 +; GFX9-NEXT: v_readfirstlane_b32 s19, v19 +; GFX9-NEXT: v_readfirstlane_b32 s20, v0 +; GFX9-NEXT: v_readfirstlane_b32 s21, v1 +; GFX9-NEXT: v_readfirstlane_b32 s22, v2 +; GFX9-NEXT: v_readfirstlane_b32 s23, v3 +; GFX9-NEXT: v_readfirstlane_b32 s24, v4 +; GFX9-NEXT: v_readfirstlane_b32 s25, v5 +; GFX9-NEXT: v_readfirstlane_b32 s26, v6 +; GFX9-NEXT: v_readfirstlane_b32 s27, v7 +; GFX9-NEXT: v_readfirstlane_b32 s28, v8 +; GFX9-NEXT: v_readfirstlane_b32 s29, v9 ; GFX9-NEXT: v_readfirstlane_b32 s40, v10 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_readfirstlane_b32 s41, v11 @@ -29212,44 +29722,34 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_lshr_b32 s42, s41, 16 ; GFX9-NEXT: s_lshr_b32 s43, s40, 16 -; GFX9-NEXT: s_lshr_b32 s44, s15, 16 -; GFX9-NEXT: s_lshr_b32 s45, s14, 16 -; GFX9-NEXT: s_lshr_b32 s46, s13, 16 -; GFX9-NEXT: s_lshr_b32 s47, s12, 16 -; GFX9-NEXT: s_lshr_b32 s56, s11, 16 -; GFX9-NEXT: s_lshr_b32 s57, s10, 16 -; GFX9-NEXT: s_lshr_b32 s58, s9, 16 -; GFX9-NEXT: s_lshr_b32 s59, s8, 16 -; GFX9-NEXT: s_lshr_b32 s60, s7, 16 -; GFX9-NEXT: s_lshr_b32 s61, s6, 16 -; GFX9-NEXT: s_lshr_b32 s62, s29, 16 -; GFX9-NEXT: s_lshr_b32 s63, s28, 16 -; GFX9-NEXT: s_lshr_b32 s72, s27, 16 -; GFX9-NEXT: s_lshr_b32 s73, s26, 16 -; GFX9-NEXT: s_lshr_b32 s74, s25, 16 -; GFX9-NEXT: s_lshr_b32 s75, s24, 16 -; GFX9-NEXT: s_lshr_b32 s76, s23, 16 -; GFX9-NEXT: s_lshr_b32 s77, s22, 16 -; GFX9-NEXT: s_lshr_b32 s78, s21, 16 -; GFX9-NEXT: s_lshr_b32 s79, s20, 16 -; GFX9-NEXT: s_lshr_b32 s88, s19, 16 -; GFX9-NEXT: s_lshr_b32 s89, s18, 16 -; GFX9-NEXT: s_lshr_b32 s90, s17, 16 -; GFX9-NEXT: s_lshr_b32 s91, s16, 16 +; GFX9-NEXT: s_lshr_b32 s44, s29, 16 +; GFX9-NEXT: s_lshr_b32 s45, s28, 16 +; GFX9-NEXT: s_lshr_b32 s46, s27, 16 +; GFX9-NEXT: s_lshr_b32 s47, s26, 16 +; GFX9-NEXT: s_lshr_b32 s56, s25, 16 +; GFX9-NEXT: s_lshr_b32 s57, s24, 16 +; GFX9-NEXT: s_lshr_b32 s58, s23, 16 +; GFX9-NEXT: s_lshr_b32 s59, s22, 16 +; GFX9-NEXT: s_lshr_b32 s60, s21, 16 +; GFX9-NEXT: s_lshr_b32 s61, s20, 16 +; GFX9-NEXT: s_lshr_b32 s62, s19, 16 +; GFX9-NEXT: s_lshr_b32 s63, s18, 16 +; GFX9-NEXT: s_lshr_b32 s72, s17, 16 +; GFX9-NEXT: s_lshr_b32 s73, s16, 16 +; GFX9-NEXT: s_lshr_b32 s74, s15, 16 +; GFX9-NEXT: s_lshr_b32 s75, s14, 16 +; GFX9-NEXT: s_lshr_b32 s76, s13, 16 +; GFX9-NEXT: s_lshr_b32 s77, s12, 16 +; GFX9-NEXT: s_lshr_b32 s78, s11, 16 +; GFX9-NEXT: s_lshr_b32 s79, s10, 16 +; GFX9-NEXT: s_lshr_b32 s88, s9, 16 +; GFX9-NEXT: s_lshr_b32 s89, s8, 16 +; GFX9-NEXT: s_lshr_b32 s90, s7, 16 +; GFX9-NEXT: s_lshr_b32 s91, s6, 16 ; GFX9-NEXT: s_cbranch_execnz .LBB45_3 ; GFX9-NEXT: .LBB45_2: ; %cmp.true ; GFX9-NEXT: s_add_u32 s40, s40, 3 ; GFX9-NEXT: s_addc_u32 s41, s41, 0 -; GFX9-NEXT: s_add_u32 s14, s14, 3 -; GFX9-NEXT: s_addc_u32 s15, s15, 0 -; GFX9-NEXT: s_add_u32 s12, s12, 3 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-NEXT: s_add_u32 s10, s10, 3 -; GFX9-NEXT: s_addc_u32 s11, s11, 0 -; GFX9-NEXT: s_add_u32 s8, s8, 3 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-NEXT: s_add_u32 s6, s6, 3 -; GFX9-NEXT: s_addc_u32 s7, s7, 0 ; GFX9-NEXT: s_add_u32 s28, s28, 3 ; GFX9-NEXT: s_addc_u32 s29, s29, 0 ; GFX9-NEXT: s_add_u32 s26, s26, 3 @@ -29264,83 +29764,93 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i ; GFX9-NEXT: s_addc_u32 s19, s19, 0 ; GFX9-NEXT: s_add_u32 s16, s16, 3 ; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_add_u32 s14, s14, 3 +; GFX9-NEXT: s_addc_u32 s15, s15, 0 +; GFX9-NEXT: s_add_u32 s12, s12, 3 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_add_u32 s10, s10, 3 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 ; GFX9-NEXT: s_lshr_b32 s42, s41, 16 ; GFX9-NEXT: s_lshr_b32 s43, s40, 16 -; GFX9-NEXT: s_lshr_b32 s44, s15, 16 -; GFX9-NEXT: s_lshr_b32 s45, s14, 16 -; GFX9-NEXT: s_lshr_b32 s46, s13, 16 -; GFX9-NEXT: s_lshr_b32 s47, s12, 16 -; GFX9-NEXT: s_lshr_b32 s56, s11, 16 -; GFX9-NEXT: s_lshr_b32 s57, s10, 16 -; GFX9-NEXT: s_lshr_b32 s58, s9, 16 -; GFX9-NEXT: s_lshr_b32 s59, s8, 16 -; GFX9-NEXT: s_lshr_b32 s60, s7, 16 -; GFX9-NEXT: s_lshr_b32 s61, s6, 16 -; GFX9-NEXT: s_lshr_b32 s62, s29, 16 -; GFX9-NEXT: s_lshr_b32 s63, s28, 16 -; GFX9-NEXT: s_lshr_b32 s72, s27, 16 -; GFX9-NEXT: s_lshr_b32 s73, s26, 16 -; GFX9-NEXT: s_lshr_b32 s74, s25, 16 -; GFX9-NEXT: s_lshr_b32 s75, s24, 16 -; GFX9-NEXT: s_lshr_b32 s76, s23, 16 -; GFX9-NEXT: s_lshr_b32 s77, s22, 16 -; GFX9-NEXT: s_lshr_b32 s78, s21, 16 -; GFX9-NEXT: s_lshr_b32 s79, s20, 16 -; GFX9-NEXT: s_lshr_b32 s88, s19, 16 -; GFX9-NEXT: s_lshr_b32 s89, s18, 16 -; GFX9-NEXT: s_lshr_b32 s90, s17, 16 -; GFX9-NEXT: s_lshr_b32 s91, s16, 16 +; GFX9-NEXT: s_lshr_b32 s44, s29, 16 +; GFX9-NEXT: s_lshr_b32 s45, s28, 16 +; GFX9-NEXT: s_lshr_b32 s46, s27, 16 +; GFX9-NEXT: s_lshr_b32 s47, s26, 16 +; GFX9-NEXT: s_lshr_b32 s56, s25, 16 +; GFX9-NEXT: s_lshr_b32 s57, s24, 16 +; GFX9-NEXT: s_lshr_b32 s58, s23, 16 +; GFX9-NEXT: s_lshr_b32 s59, s22, 16 +; GFX9-NEXT: s_lshr_b32 s60, s21, 16 +; GFX9-NEXT: s_lshr_b32 s61, s20, 16 +; GFX9-NEXT: s_lshr_b32 s62, s19, 16 +; GFX9-NEXT: s_lshr_b32 s63, s18, 16 +; GFX9-NEXT: s_lshr_b32 s72, s17, 16 +; GFX9-NEXT: s_lshr_b32 s73, s16, 16 +; GFX9-NEXT: s_lshr_b32 s74, s15, 16 +; GFX9-NEXT: s_lshr_b32 s75, s14, 16 +; GFX9-NEXT: s_lshr_b32 s76, s13, 16 +; GFX9-NEXT: s_lshr_b32 s77, s12, 16 +; GFX9-NEXT: s_lshr_b32 s78, s11, 16 +; GFX9-NEXT: s_lshr_b32 s79, s10, 16 +; GFX9-NEXT: s_lshr_b32 s88, s9, 16 +; GFX9-NEXT: s_lshr_b32 s89, s8, 16 +; GFX9-NEXT: s_lshr_b32 s90, s7, 16 +; GFX9-NEXT: s_lshr_b32 s91, s6, 16 ; GFX9-NEXT: .LBB45_3: ; %end -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s91 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s90 -; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s89 -; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s88 -; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s79 -; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s78 -; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s77 -; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s76 -; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s75 -; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s74 -; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s73 -; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s72 -; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s63 -; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s62 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s61 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s60 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s59 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s58 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s57 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s56 -; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s47 -; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s46 -; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s45 -; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s91 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s7, s90 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s89 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s9, s88 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s10, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s11, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s12, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s13, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s14, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s15, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s16, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s17, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s44 ; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s43 ; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s42 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s18 -; GFX9-NEXT: v_mov_b32_e32 v5, s19 -; GFX9-NEXT: v_mov_b32_e32 v6, s20 -; GFX9-NEXT: v_mov_b32_e32 v7, s21 -; GFX9-NEXT: v_mov_b32_e32 v8, s22 -; GFX9-NEXT: v_mov_b32_e32 v9, s23 -; GFX9-NEXT: v_mov_b32_e32 v10, s24 -; GFX9-NEXT: v_mov_b32_e32 v11, s25 -; GFX9-NEXT: v_mov_b32_e32 v12, s26 -; GFX9-NEXT: v_mov_b32_e32 v13, s27 -; GFX9-NEXT: v_mov_b32_e32 v14, s6 -; GFX9-NEXT: v_mov_b32_e32 v15, s7 -; GFX9-NEXT: v_mov_b32_e32 v16, s8 -; GFX9-NEXT: v_mov_b32_e32 v17, s9 -; GFX9-NEXT: v_mov_b32_e32 v18, s10 -; GFX9-NEXT: v_mov_b32_e32 v19, s11 -; GFX9-NEXT: v_mov_b32_e32 v20, s12 -; GFX9-NEXT: v_mov_b32_e32 v21, s13 -; GFX9-NEXT: v_mov_b32_e32 v22, s14 -; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-NEXT: v_mov_b32_e32 v12, s16 +; GFX9-NEXT: v_mov_b32_e32 v13, s17 +; GFX9-NEXT: v_mov_b32_e32 v14, s18 +; GFX9-NEXT: v_mov_b32_e32 v15, s19 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v18, s22 +; GFX9-NEXT: v_mov_b32_e32 v19, s23 +; GFX9-NEXT: v_mov_b32_e32 v20, s24 +; GFX9-NEXT: v_mov_b32_e32 v21, s25 +; GFX9-NEXT: v_mov_b32_e32 v22, s26 +; GFX9-NEXT: v_mov_b32_e32 v23, s27 ; GFX9-NEXT: v_mov_b32_e32 v24, s28 ; GFX9-NEXT: v_mov_b32_e32 v25, s29 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -29376,41 +29886,68 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i ; GFX11-LABEL: bitcast_v13i64_to_v52f16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 +; GFX11-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 +; GFX11-NEXT: v_dual_mov_b32 v13, s16 :: v_dual_mov_b32 v14, s17 +; GFX11-NEXT: v_dual_mov_b32 v15, s18 :: v_dual_mov_b32 v16, s19 +; GFX11-NEXT: v_dual_mov_b32 v17, s20 :: v_dual_mov_b32 v18, s21 +; GFX11-NEXT: v_dual_mov_b32 v19, s22 :: v_dual_mov_b32 v20, s23 +; GFX11-NEXT: v_dual_mov_b32 v21, s24 :: v_dual_mov_b32 v22, s25 +; GFX11-NEXT: v_dual_mov_b32 v23, s26 :: v_dual_mov_b32 v24, s27 +; GFX11-NEXT: v_dual_mov_b32 v25, s28 :: v_dual_mov_b32 v26, s29 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: v_readfirstlane_b32 s8, v4 -; GFX11-NEXT: v_readfirstlane_b32 s9, v5 -; GFX11-NEXT: v_readfirstlane_b32 s11, v6 -; GFX11-NEXT: v_readfirstlane_b32 s10, v7 +; GFX11-NEXT: v_readfirstlane_b32 s0, v9 +; GFX11-NEXT: v_readfirstlane_b32 s1, v10 +; GFX11-NEXT: v_readfirstlane_b32 s2, v11 +; GFX11-NEXT: v_readfirstlane_b32 s3, v12 +; GFX11-NEXT: v_readfirstlane_b32 s4, v13 +; GFX11-NEXT: v_readfirstlane_b32 s5, v14 +; GFX11-NEXT: v_readfirstlane_b32 s6, v15 +; GFX11-NEXT: v_readfirstlane_b32 s7, v16 +; GFX11-NEXT: v_readfirstlane_b32 s8, v17 +; GFX11-NEXT: v_readfirstlane_b32 s9, v18 +; GFX11-NEXT: v_readfirstlane_b32 s10, v19 +; GFX11-NEXT: v_readfirstlane_b32 s11, v20 +; GFX11-NEXT: v_readfirstlane_b32 s12, v21 +; GFX11-NEXT: v_readfirstlane_b32 s13, v22 +; GFX11-NEXT: v_readfirstlane_b32 s14, v23 +; GFX11-NEXT: v_readfirstlane_b32 s15, v24 +; GFX11-NEXT: v_readfirstlane_b32 s16, v25 +; GFX11-NEXT: v_readfirstlane_b32 s17, v26 +; GFX11-NEXT: v_readfirstlane_b32 s18, v0 +; GFX11-NEXT: v_readfirstlane_b32 s19, v1 +; GFX11-NEXT: v_readfirstlane_b32 s20, v2 +; GFX11-NEXT: v_readfirstlane_b32 s21, v3 +; GFX11-NEXT: v_readfirstlane_b32 s22, v4 +; GFX11-NEXT: v_readfirstlane_b32 s23, v5 +; GFX11-NEXT: v_readfirstlane_b32 s25, v6 +; GFX11-NEXT: v_readfirstlane_b32 s24, v7 ; GFX11-NEXT: s_mov_b32 s78, 0 -; GFX11-NEXT: s_and_b32 s12, vcc_lo, exec_lo +; GFX11-NEXT: s_and_b32 s26, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB45_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: s_lshr_b32 s12, s10, 16 -; GFX11-NEXT: s_lshr_b32 s13, s11, 16 -; GFX11-NEXT: s_lshr_b32 s14, s9, 16 -; GFX11-NEXT: s_lshr_b32 s15, s8, 16 -; GFX11-NEXT: s_lshr_b32 s40, s7, 16 -; GFX11-NEXT: s_lshr_b32 s41, s6, 16 -; GFX11-NEXT: s_lshr_b32 s42, s5, 16 -; GFX11-NEXT: s_lshr_b32 s43, s4, 16 -; GFX11-NEXT: s_lshr_b32 s44, s29, 16 -; GFX11-NEXT: s_lshr_b32 s45, s28, 16 -; GFX11-NEXT: s_lshr_b32 s46, s27, 16 -; GFX11-NEXT: s_lshr_b32 s47, s26, 16 -; GFX11-NEXT: s_lshr_b32 s56, s25, 16 -; GFX11-NEXT: s_lshr_b32 s57, s24, 16 -; GFX11-NEXT: s_lshr_b32 s58, s23, 16 -; GFX11-NEXT: s_lshr_b32 s59, s22, 16 -; GFX11-NEXT: s_lshr_b32 s60, s21, 16 -; GFX11-NEXT: s_lshr_b32 s61, s20, 16 -; GFX11-NEXT: s_lshr_b32 s62, s19, 16 -; GFX11-NEXT: s_lshr_b32 s63, s18, 16 -; GFX11-NEXT: s_lshr_b32 s72, s17, 16 -; GFX11-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-NEXT: s_lshr_b32 s26, s24, 16 +; GFX11-NEXT: s_lshr_b32 s27, s25, 16 +; GFX11-NEXT: s_lshr_b32 s28, s23, 16 +; GFX11-NEXT: s_lshr_b32 s29, s22, 16 +; GFX11-NEXT: s_lshr_b32 s40, s21, 16 +; GFX11-NEXT: s_lshr_b32 s41, s20, 16 +; GFX11-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-NEXT: s_lshr_b32 s44, s17, 16 +; GFX11-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-NEXT: s_lshr_b32 s46, s15, 16 +; GFX11-NEXT: s_lshr_b32 s47, s14, 16 +; GFX11-NEXT: s_lshr_b32 s56, s13, 16 +; GFX11-NEXT: s_lshr_b32 s57, s12, 16 +; GFX11-NEXT: s_lshr_b32 s58, s11, 16 +; GFX11-NEXT: s_lshr_b32 s59, s10, 16 +; GFX11-NEXT: s_lshr_b32 s60, s9, 16 +; GFX11-NEXT: s_lshr_b32 s61, s8, 16 +; GFX11-NEXT: s_lshr_b32 s62, s7, 16 +; GFX11-NEXT: s_lshr_b32 s63, s6, 16 +; GFX11-NEXT: s_lshr_b32 s72, s5, 16 +; GFX11-NEXT: s_lshr_b32 s73, s4, 16 ; GFX11-NEXT: s_lshr_b32 s74, s3, 16 ; GFX11-NEXT: s_lshr_b32 s75, s2, 16 ; GFX11-NEXT: s_lshr_b32 s76, s1, 16 @@ -29418,20 +29955,8 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s78 ; GFX11-NEXT: s_cbranch_vccnz .LBB45_3 ; GFX11-NEXT: .LBB45_2: ; %cmp.true -; GFX11-NEXT: s_add_u32 s11, s11, 3 -; GFX11-NEXT: s_addc_u32 s10, s10, 0 -; GFX11-NEXT: s_add_u32 s8, s8, 3 -; GFX11-NEXT: s_addc_u32 s9, s9, 0 -; GFX11-NEXT: s_add_u32 s6, s6, 3 -; GFX11-NEXT: s_addc_u32 s7, s7, 0 -; GFX11-NEXT: s_add_u32 s4, s4, 3 -; GFX11-NEXT: s_addc_u32 s5, s5, 0 -; GFX11-NEXT: s_add_u32 s28, s28, 3 -; GFX11-NEXT: s_addc_u32 s29, s29, 0 -; GFX11-NEXT: s_add_u32 s26, s26, 3 -; GFX11-NEXT: s_addc_u32 s27, s27, 0 -; GFX11-NEXT: s_add_u32 s24, s24, 3 -; GFX11-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-NEXT: s_add_u32 s25, s25, 3 +; GFX11-NEXT: s_addc_u32 s24, s24, 0 ; GFX11-NEXT: s_add_u32 s22, s22, 3 ; GFX11-NEXT: s_addc_u32 s23, s23, 0 ; GFX11-NEXT: s_add_u32 s20, s20, 3 @@ -29440,32 +29965,44 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i ; GFX11-NEXT: s_addc_u32 s19, s19, 0 ; GFX11-NEXT: s_add_u32 s16, s16, 3 ; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s14, s14, 3 +; GFX11-NEXT: s_addc_u32 s15, s15, 0 +; GFX11-NEXT: s_add_u32 s12, s12, 3 +; GFX11-NEXT: s_addc_u32 s13, s13, 0 +; GFX11-NEXT: s_add_u32 s10, s10, 3 +; GFX11-NEXT: s_addc_u32 s11, s11, 0 +; GFX11-NEXT: s_add_u32 s8, s8, 3 +; GFX11-NEXT: s_addc_u32 s9, s9, 0 +; GFX11-NEXT: s_add_u32 s6, s6, 3 +; GFX11-NEXT: s_addc_u32 s7, s7, 0 +; GFX11-NEXT: s_add_u32 s4, s4, 3 +; GFX11-NEXT: s_addc_u32 s5, s5, 0 ; GFX11-NEXT: s_add_u32 s2, s2, 3 ; GFX11-NEXT: s_addc_u32 s3, s3, 0 ; GFX11-NEXT: s_add_u32 s0, s0, 3 ; GFX11-NEXT: s_addc_u32 s1, s1, 0 -; GFX11-NEXT: s_lshr_b32 s12, s10, 16 -; GFX11-NEXT: s_lshr_b32 s13, s11, 16 -; GFX11-NEXT: s_lshr_b32 s14, s9, 16 -; GFX11-NEXT: s_lshr_b32 s15, s8, 16 -; GFX11-NEXT: s_lshr_b32 s40, s7, 16 -; GFX11-NEXT: s_lshr_b32 s41, s6, 16 -; GFX11-NEXT: s_lshr_b32 s42, s5, 16 -; GFX11-NEXT: s_lshr_b32 s43, s4, 16 -; GFX11-NEXT: s_lshr_b32 s44, s29, 16 -; GFX11-NEXT: s_lshr_b32 s45, s28, 16 -; GFX11-NEXT: s_lshr_b32 s46, s27, 16 -; GFX11-NEXT: s_lshr_b32 s47, s26, 16 -; GFX11-NEXT: s_lshr_b32 s56, s25, 16 -; GFX11-NEXT: s_lshr_b32 s57, s24, 16 -; GFX11-NEXT: s_lshr_b32 s58, s23, 16 -; GFX11-NEXT: s_lshr_b32 s59, s22, 16 -; GFX11-NEXT: s_lshr_b32 s60, s21, 16 -; GFX11-NEXT: s_lshr_b32 s61, s20, 16 -; GFX11-NEXT: s_lshr_b32 s62, s19, 16 -; GFX11-NEXT: s_lshr_b32 s63, s18, 16 -; GFX11-NEXT: s_lshr_b32 s72, s17, 16 -; GFX11-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-NEXT: s_lshr_b32 s26, s24, 16 +; GFX11-NEXT: s_lshr_b32 s27, s25, 16 +; GFX11-NEXT: s_lshr_b32 s28, s23, 16 +; GFX11-NEXT: s_lshr_b32 s29, s22, 16 +; GFX11-NEXT: s_lshr_b32 s40, s21, 16 +; GFX11-NEXT: s_lshr_b32 s41, s20, 16 +; GFX11-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-NEXT: s_lshr_b32 s44, s17, 16 +; GFX11-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-NEXT: s_lshr_b32 s46, s15, 16 +; GFX11-NEXT: s_lshr_b32 s47, s14, 16 +; GFX11-NEXT: s_lshr_b32 s56, s13, 16 +; GFX11-NEXT: s_lshr_b32 s57, s12, 16 +; GFX11-NEXT: s_lshr_b32 s58, s11, 16 +; GFX11-NEXT: s_lshr_b32 s59, s10, 16 +; GFX11-NEXT: s_lshr_b32 s60, s9, 16 +; GFX11-NEXT: s_lshr_b32 s61, s8, 16 +; GFX11-NEXT: s_lshr_b32 s62, s7, 16 +; GFX11-NEXT: s_lshr_b32 s63, s6, 16 +; GFX11-NEXT: s_lshr_b32 s72, s5, 16 +; GFX11-NEXT: s_lshr_b32 s73, s4, 16 ; GFX11-NEXT: s_lshr_b32 s74, s3, 16 ; GFX11-NEXT: s_lshr_b32 s75, s2, 16 ; GFX11-NEXT: s_lshr_b32 s76, s1, 16 @@ -29476,41 +30013,41 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i ; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s76 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s75 ; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s74 -; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s73 -; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s72 -; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s63 -; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s62 -; GFX11-NEXT: s_pack_ll_b32_b16 s20, s20, s61 -; GFX11-NEXT: s_pack_ll_b32_b16 s21, s21, s60 -; GFX11-NEXT: s_pack_ll_b32_b16 s22, s22, s59 -; GFX11-NEXT: s_pack_ll_b32_b16 s23, s23, s58 -; GFX11-NEXT: s_pack_ll_b32_b16 s24, s24, s57 -; GFX11-NEXT: s_pack_ll_b32_b16 s25, s25, s56 -; GFX11-NEXT: s_pack_ll_b32_b16 s26, s26, s47 -; GFX11-NEXT: s_pack_ll_b32_b16 s27, s27, s46 -; GFX11-NEXT: s_pack_ll_b32_b16 s28, s28, s45 -; GFX11-NEXT: s_pack_ll_b32_b16 s29, s29, s44 -; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s43 -; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s42 -; GFX11-NEXT: s_pack_ll_b32_b16 s6, s6, s41 -; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s40 -; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s15 -; GFX11-NEXT: s_pack_ll_b32_b16 s9, s9, s14 -; GFX11-NEXT: s_pack_ll_b32_b16 s11, s11, s13 -; GFX11-NEXT: s_pack_ll_b32_b16 s10, s10, s12 +; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s73 +; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s72 +; GFX11-NEXT: s_pack_ll_b32_b16 s6, s6, s63 +; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s62 +; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s61 +; GFX11-NEXT: s_pack_ll_b32_b16 s9, s9, s60 +; GFX11-NEXT: s_pack_ll_b32_b16 s10, s10, s59 +; GFX11-NEXT: s_pack_ll_b32_b16 s11, s11, s58 +; GFX11-NEXT: s_pack_ll_b32_b16 s12, s12, s57 +; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s56 +; GFX11-NEXT: s_pack_ll_b32_b16 s14, s14, s47 +; GFX11-NEXT: s_pack_ll_b32_b16 s15, s15, s46 +; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s45 +; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s44 +; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s43 +; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s42 +; GFX11-NEXT: s_pack_ll_b32_b16 s20, s20, s41 +; GFX11-NEXT: s_pack_ll_b32_b16 s21, s21, s40 +; GFX11-NEXT: s_pack_ll_b32_b16 s22, s22, s29 +; GFX11-NEXT: s_pack_ll_b32_b16 s23, s23, s28 +; GFX11-NEXT: s_pack_ll_b32_b16 s25, s25, s27 +; GFX11-NEXT: s_pack_ll_b32_b16 s24, s24, s26 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 -; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 -; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 -; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 -; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 -; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 -; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 -; GFX11-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 -; GFX11-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 -; GFX11-NEXT: v_dual_mov_b32 v22, s8 :: v_dual_mov_b32 v23, s9 -; GFX11-NEXT: v_dual_mov_b32 v24, s11 :: v_dual_mov_b32 v25, s10 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GFX11-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v21, s21 +; GFX11-NEXT: v_dual_mov_b32 v22, s22 :: v_dual_mov_b32 v23, s23 +; GFX11-NEXT: v_dual_mov_b32 v24, s25 :: v_dual_mov_b32 v25, s24 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB45_4: ; GFX11-NEXT: ; implicit-def: $sgpr77 @@ -29535,10 +30072,10 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr41 ; GFX11-NEXT: ; implicit-def: $sgpr40 -; GFX11-NEXT: ; implicit-def: $sgpr15 -; GFX11-NEXT: ; implicit-def: $sgpr14 -; GFX11-NEXT: ; implicit-def: $sgpr13 -; GFX11-NEXT: ; implicit-def: $sgpr12 +; GFX11-NEXT: ; implicit-def: $sgpr29 +; GFX11-NEXT: ; implicit-def: $sgpr28 +; GFX11-NEXT: ; implicit-def: $sgpr27 +; GFX11-NEXT: ; implicit-def: $sgpr26 ; GFX11-NEXT: s_branch .LBB45_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -33287,8 +33824,8 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; VI-NEXT: v_mov_b32_e32 v21, s16 ; VI-NEXT: v_mov_b32_e32 v22, s17 -; VI-NEXT: v_mov_b32_e32 v17, s18 -; VI-NEXT: v_mov_b32_e32 v18, s19 +; VI-NEXT: v_mov_b32_e32 v15, s18 +; VI-NEXT: v_mov_b32_e32 v16, s19 ; VI-NEXT: v_mov_b32_e32 v13, s20 ; VI-NEXT: v_mov_b32_e32 v14, s21 ; VI-NEXT: v_mov_b32_e32 v30, s22 @@ -33298,8 +33835,8 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v19, s26 ; VI-NEXT: v_mov_b32_e32 v20, s27 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v15, s28 -; VI-NEXT: v_mov_b32_e32 v16, s29 +; VI-NEXT: v_mov_b32_e32 v17, s28 +; VI-NEXT: v_mov_b32_e32 v18, s29 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill @@ -33318,8 +33855,8 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v17 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 @@ -33328,8 +33865,8 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v30 ; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 ; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v21 ; VI-NEXT: s_cbranch_execnz .LBB49_3 @@ -33340,12 +33877,12 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 ; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; VI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; VI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 ; VI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 @@ -33359,8 +33896,8 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v17 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 @@ -33369,8 +33906,8 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v30 ; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 ; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v21 ; VI-NEXT: .LBB49_3: ; %end @@ -33379,11 +33916,11 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v37 ; VI-NEXT: v_or_b32_sdwa v37, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 -; VI-NEXT: v_or_b32_sdwa v26, v17, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v27 -; VI-NEXT: v_or_b32_sdwa v27, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 -; VI-NEXT: v_or_b32_sdwa v28, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v15, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v27 +; VI-NEXT: v_or_b32_sdwa v27, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v28 +; VI-NEXT: v_or_b32_sdwa v28, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v29 ; VI-NEXT: v_or_b32_sdwa v29, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v43 @@ -33404,17 +33941,17 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 -; VI-NEXT: v_or_b32_sdwa v12, v15, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v54 -; VI-NEXT: v_or_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 +; VI-NEXT: v_or_b32_sdwa v12, v17, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 +; VI-NEXT: v_or_b32_sdwa v13, v18, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 ; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -33479,8 +34016,8 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; GFX9-NEXT: v_mov_b32_e32 v21, s16 ; GFX9-NEXT: v_mov_b32_e32 v22, s17 -; GFX9-NEXT: v_mov_b32_e32 v17, s18 -; GFX9-NEXT: v_mov_b32_e32 v18, s19 +; GFX9-NEXT: v_mov_b32_e32 v15, s18 +; GFX9-NEXT: v_mov_b32_e32 v16, s19 ; GFX9-NEXT: v_mov_b32_e32 v13, s20 ; GFX9-NEXT: v_mov_b32_e32 v14, s21 ; GFX9-NEXT: v_mov_b32_e32 v30, s22 @@ -33490,8 +34027,8 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v19, s26 ; GFX9-NEXT: v_mov_b32_e32 v20, s27 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_mov_b32_e32 v15, s28 -; GFX9-NEXT: v_mov_b32_e32 v16, s29 +; GFX9-NEXT: v_mov_b32_e32 v17, s28 +; GFX9-NEXT: v_mov_b32_e32 v18, s29 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill @@ -33510,8 +34047,8 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 @@ -33520,8 +34057,8 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v15 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v21 ; GFX9-NEXT: s_cbranch_execnz .LBB49_3 @@ -33532,12 +34069,12 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 ; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; GFX9-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; GFX9-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 ; GFX9-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v10 @@ -33551,8 +34088,8 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 @@ -33561,8 +34098,8 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v15 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v21 ; GFX9-NEXT: .LBB49_3: ; %end @@ -33582,9 +34119,9 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v13 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v20 ; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v13 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v15 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v17 ; GFX9-NEXT: v_lshl_or_b32 v12, v12, 16, v13 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v16 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v18 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v13, v41, 16, v13 ; GFX9-NEXT: v_lshl_or_b32 v14, v40, 16, v0 @@ -33592,15 +34129,15 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v16 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v15, v55, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v18 ; GFX9-NEXT: v_lshl_or_b32 v16, v54, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v17 ; GFX9-NEXT: v_lshl_or_b32 v17, v53, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GFX9-NEXT: v_lshl_or_b32 v18, v52, 16, v0 @@ -34440,7 +34977,7 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 ; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 ; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v24 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v25 ; SI-NEXT: .LBB50_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] @@ -37429,19 +37966,21 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; SI-NEXT: v_readfirstlane_b32 s14, v1 -; SI-NEXT: v_readfirstlane_b32 s15, v2 -; SI-NEXT: v_readfirstlane_b32 s12, v3 -; SI-NEXT: v_readfirstlane_b32 s13, v4 -; SI-NEXT: v_readfirstlane_b32 s10, v5 -; SI-NEXT: v_readfirstlane_b32 s11, v6 -; SI-NEXT: v_readfirstlane_b32 s8, v7 -; SI-NEXT: v_readfirstlane_b32 s9, v8 -; SI-NEXT: v_readfirstlane_b32 s6, v9 -; SI-NEXT: v_readfirstlane_b32 s7, v10 -; SI-NEXT: v_readfirstlane_b32 s4, v11 -; SI-NEXT: s_and_b64 s[40:41], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s5, v12 +; SI-NEXT: v_mov_b32_e32 v25, s16 +; SI-NEXT: v_mov_b32_e32 v26, s17 +; SI-NEXT: v_mov_b32_e32 v21, s18 +; SI-NEXT: v_mov_b32_e32 v22, s19 +; SI-NEXT: v_mov_b32_e32 v17, s20 +; SI-NEXT: v_mov_b32_e32 v18, s21 +; SI-NEXT: v_mov_b32_e32 v23, s22 +; SI-NEXT: v_mov_b32_e32 v24, s23 +; SI-NEXT: v_mov_b32_e32 v19, s24 +; SI-NEXT: v_mov_b32_e32 v20, s25 +; SI-NEXT: v_mov_b32_e32 v15, s26 +; SI-NEXT: v_mov_b32_e32 v16, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s28 +; SI-NEXT: v_mov_b32_e32 v14, s29 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -37460,362 +37999,440 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s40, s5, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s40 -; SI-NEXT: s_lshr_b32 s40, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s40 -; SI-NEXT: s_lshr_b32 s40, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s40 -; SI-NEXT: s_lshr_b32 s40, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s40 -; SI-NEXT: s_lshr_b32 s40, s9, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v63, s40 -; SI-NEXT: s_lshr_b32 s40, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s40 -; SI-NEXT: s_lshr_b32 s40, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v62, s40 -; SI-NEXT: s_lshr_b32 s40, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s40 -; SI-NEXT: s_lshr_b32 s40, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s40 -; SI-NEXT: s_lshr_b32 s40, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s40 -; SI-NEXT: s_lshr_b32 s40, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s40 -; SI-NEXT: s_lshr_b32 s40, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s40 -; SI-NEXT: s_lshr_b32 s40, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s40 -; SI-NEXT: s_lshr_b32 s40, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s40 -; SI-NEXT: s_lshr_b32 s40, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s40 -; SI-NEXT: s_lshr_b32 s40, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s40 -; SI-NEXT: s_lshr_b32 s40, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s40 -; SI-NEXT: s_lshr_b32 s40, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s40 -; SI-NEXT: s_lshr_b32 s40, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s40 -; SI-NEXT: s_lshr_b32 s40, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s40 -; SI-NEXT: s_lshr_b32 s40, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s40 -; SI-NEXT: s_lshr_b32 s40, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s40 -; SI-NEXT: s_lshr_b32 s40, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s40 -; SI-NEXT: s_lshr_b32 s40, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s40 -; SI-NEXT: s_lshr_b32 s40, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s40 -; SI-NEXT: s_lshr_b32 s40, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s16 -; SI-NEXT: s_cbranch_execnz .LBB53_3 -; SI-NEXT: .LBB53_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[1:2], s[16:17], 1.0 -; SI-NEXT: v_add_f64 v[54:55], s[18:19], 1.0 -; SI-NEXT: v_add_f64 v[49:50], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[37:38], s[22:23], 1.0 -; SI-NEXT: v_add_f64 v[33:34], s[24:25], 1.0 -; SI-NEXT: v_add_f64 v[31:32], s[26:27], 1.0 -; SI-NEXT: v_add_f64 v[26:27], s[28:29], 1.0 -; SI-NEXT: v_add_f64 v[22:23], s[14:15], 1.0 -; SI-NEXT: v_add_f64 v[18:19], s[12:13], 1.0 -; SI-NEXT: v_add_f64 v[14:15], s[10:11], 1.0 -; SI-NEXT: v_add_f64 v[11:12], s[8:9], 1.0 -; SI-NEXT: v_add_f64 v[7:8], s[6:7], 1.0 -; SI-NEXT: v_add_f64 v[3:4], s[4:5], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v34 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v32 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v27 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v19 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v14 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v10 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v19 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v18 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v21 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v25 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v7 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v6 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v5 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v4 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v3 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v2 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v1 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v14 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v13 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v16 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v20 +; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_mov_b32_e32 v43, v10 +; SI-NEXT: v_mov_b32_e32 v41, v11 +; SI-NEXT: v_mov_b32_e32 v51, v12 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: .LBB53_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v10, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v42 -; SI-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v13, v41, v13 -; SI-NEXT: buffer_store_dword v13, v10, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v10, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v55 -; SI-NEXT: v_add_i32_e32 v55, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: buffer_store_dword v10, v55, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v53 -; SI-NEXT: v_add_i32_e32 v53, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: buffer_store_dword v10, v53, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v52 -; SI-NEXT: v_add_i32_e32 v51, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: buffer_store_dword v10, v51, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v50 -; SI-NEXT: v_add_i32_e32 v49, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: buffer_store_dword v10, v49, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v48 -; SI-NEXT: v_add_i32_e32 v39, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: buffer_store_dword v10, v39, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v38 -; SI-NEXT: v_add_i32_e32 v37, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: buffer_store_dword v10, v37, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v36 -; SI-NEXT: v_add_i32_e32 v35, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: buffer_store_dword v10, v35, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v34 -; SI-NEXT: v_add_i32_e32 v33, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: buffer_store_dword v10, v33, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v32 -; SI-NEXT: v_add_i32_e32 v31, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: buffer_store_dword v10, v31, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v30 -; SI-NEXT: v_add_i32_e32 v28, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: buffer_store_dword v10, v28, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v11 -; SI-NEXT: v_add_i32_e32 v26, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: buffer_store_dword v10, v26, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v27 -; SI-NEXT: v_add_i32_e32 v24, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: buffer_store_dword v10, v24, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v4 -; SI-NEXT: v_add_i32_e32 v22, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: buffer_store_dword v10, v22, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v23 -; SI-NEXT: v_add_i32_e32 v20, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: buffer_store_dword v10, v20, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v8 -; SI-NEXT: v_add_i32_e32 v18, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: buffer_store_dword v10, v18, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v19 -; SI-NEXT: v_add_i32_e32 v16, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: buffer_store_dword v10, v16, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v7 -; SI-NEXT: v_add_i32_e32 v14, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: buffer_store_dword v10, v14, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x4c, v0 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x50, v0 -; SI-NEXT: v_or_b32_e32 v3, v3, v9 -; SI-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v12 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v7, v3 -; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v25 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v21 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -37839,58 +38456,73 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: s_branch .LBB53_2 ; ; VI-LABEL: bitcast_v13f64_to_v52f16_scalar: @@ -37899,8 +38531,8 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; VI-NEXT: v_mov_b32_e32 v21, s16 ; VI-NEXT: v_mov_b32_e32 v22, s17 -; VI-NEXT: v_mov_b32_e32 v17, s18 -; VI-NEXT: v_mov_b32_e32 v18, s19 +; VI-NEXT: v_mov_b32_e32 v15, s18 +; VI-NEXT: v_mov_b32_e32 v16, s19 ; VI-NEXT: v_mov_b32_e32 v13, s20 ; VI-NEXT: v_mov_b32_e32 v14, s21 ; VI-NEXT: v_mov_b32_e32 v30, s22 @@ -37910,8 +38542,8 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; VI-NEXT: v_mov_b32_e32 v19, s26 ; VI-NEXT: v_mov_b32_e32 v20, s27 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v15, s28 -; VI-NEXT: v_mov_b32_e32 v16, s29 +; VI-NEXT: v_mov_b32_e32 v17, s28 +; VI-NEXT: v_mov_b32_e32 v18, s29 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill @@ -37930,8 +38562,8 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v17 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 @@ -37940,8 +38572,8 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v30 ; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 ; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v21 ; VI-NEXT: s_cbranch_execnz .LBB53_3 @@ -37952,12 +38584,12 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 ; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; VI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; VI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 ; VI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 @@ -37971,8 +38603,8 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v17 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 @@ -37981,8 +38613,8 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v30 ; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 ; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v21 ; VI-NEXT: .LBB53_3: ; %end @@ -37991,11 +38623,11 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v37 ; VI-NEXT: v_or_b32_sdwa v37, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 -; VI-NEXT: v_or_b32_sdwa v26, v17, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v27 -; VI-NEXT: v_or_b32_sdwa v27, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 -; VI-NEXT: v_or_b32_sdwa v28, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v15, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v27 +; VI-NEXT: v_or_b32_sdwa v27, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v28 +; VI-NEXT: v_or_b32_sdwa v28, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v29 ; VI-NEXT: v_or_b32_sdwa v29, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v43 @@ -38016,17 +38648,17 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 -; VI-NEXT: v_or_b32_sdwa v12, v15, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v54 -; VI-NEXT: v_or_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 +; VI-NEXT: v_or_b32_sdwa v12, v17, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 +; VI-NEXT: v_or_b32_sdwa v13, v18, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 ; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -38091,8 +38723,8 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; GFX9-NEXT: v_mov_b32_e32 v21, s16 ; GFX9-NEXT: v_mov_b32_e32 v22, s17 -; GFX9-NEXT: v_mov_b32_e32 v17, s18 -; GFX9-NEXT: v_mov_b32_e32 v18, s19 +; GFX9-NEXT: v_mov_b32_e32 v15, s18 +; GFX9-NEXT: v_mov_b32_e32 v16, s19 ; GFX9-NEXT: v_mov_b32_e32 v13, s20 ; GFX9-NEXT: v_mov_b32_e32 v14, s21 ; GFX9-NEXT: v_mov_b32_e32 v30, s22 @@ -38102,8 +38734,8 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; GFX9-NEXT: v_mov_b32_e32 v19, s26 ; GFX9-NEXT: v_mov_b32_e32 v20, s27 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_mov_b32_e32 v15, s28 -; GFX9-NEXT: v_mov_b32_e32 v16, s29 +; GFX9-NEXT: v_mov_b32_e32 v17, s28 +; GFX9-NEXT: v_mov_b32_e32 v18, s29 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill @@ -38122,8 +38754,8 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 @@ -38132,8 +38764,8 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v15 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v21 ; GFX9-NEXT: s_cbranch_execnz .LBB53_3 @@ -38144,12 +38776,12 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 ; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; GFX9-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; GFX9-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 ; GFX9-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v10 @@ -38163,8 +38795,8 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 @@ -38173,8 +38805,8 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v15 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v21 ; GFX9-NEXT: .LBB53_3: ; %end @@ -38194,9 +38826,9 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v13 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v20 ; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v13 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v15 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v17 ; GFX9-NEXT: v_lshl_or_b32 v12, v12, 16, v13 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v16 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v18 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v13, v41, 16, v13 ; GFX9-NEXT: v_lshl_or_b32 v14, v40, 16, v0 @@ -38204,15 +38836,15 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v16 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v15, v55, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v18 ; GFX9-NEXT: v_lshl_or_b32 v16, v54, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v17 ; GFX9-NEXT: v_lshl_or_b32 v17, v53, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GFX9-NEXT: v_lshl_or_b32 v18, v52, 16, v0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll index 77df03dcdcd9f..0e7bca4f61bfb 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll @@ -3638,25 +3638,53 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v16, s30, 0 -; SI-NEXT: v_writelane_b32 v16, s31, 1 -; SI-NEXT: v_writelane_b32 v16, s34, 2 -; SI-NEXT: v_writelane_b32 v16, s35, 3 -; SI-NEXT: v_writelane_b32 v16, s36, 4 -; SI-NEXT: v_writelane_b32 v16, s37, 5 -; SI-NEXT: v_writelane_b32 v16, s38, 6 -; SI-NEXT: v_writelane_b32 v16, s39, 7 -; SI-NEXT: v_writelane_b32 v16, s48, 8 -; SI-NEXT: v_writelane_b32 v16, s49, 9 -; SI-NEXT: v_writelane_b32 v16, s50, 10 -; SI-NEXT: v_writelane_b32 v16, s51, 11 +; SI-NEXT: v_writelane_b32 v20, s30, 0 +; SI-NEXT: v_writelane_b32 v20, s31, 1 +; SI-NEXT: v_writelane_b32 v20, s34, 2 +; SI-NEXT: v_writelane_b32 v20, s35, 3 +; SI-NEXT: v_writelane_b32 v20, s36, 4 +; SI-NEXT: v_writelane_b32 v20, s37, 5 +; SI-NEXT: v_writelane_b32 v20, s38, 6 +; SI-NEXT: v_writelane_b32 v20, s39, 7 +; SI-NEXT: v_writelane_b32 v20, s48, 8 +; SI-NEXT: v_mov_b32_e32 v16, s16 +; SI-NEXT: v_mov_b32_e32 v17, s17 +; SI-NEXT: v_writelane_b32 v20, s49, 9 +; SI-NEXT: v_mov_b32_e32 v18, s18 +; SI-NEXT: v_mov_b32_e32 v19, s19 +; SI-NEXT: v_readfirstlane_b32 s44, v16 +; SI-NEXT: v_mov_b32_e32 v16, s20 +; SI-NEXT: v_readfirstlane_b32 s45, v17 +; SI-NEXT: v_mov_b32_e32 v17, s21 +; SI-NEXT: v_writelane_b32 v20, s50, 10 +; SI-NEXT: v_readfirstlane_b32 s42, v18 +; SI-NEXT: v_mov_b32_e32 v18, s22 +; SI-NEXT: v_readfirstlane_b32 s43, v19 +; SI-NEXT: v_mov_b32_e32 v19, s23 +; SI-NEXT: v_readfirstlane_b32 s40, v16 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_readfirstlane_b32 s41, v17 +; SI-NEXT: v_mov_b32_e32 v17, s25 +; SI-NEXT: v_writelane_b32 v20, s51, 11 +; SI-NEXT: v_readfirstlane_b32 s24, v18 +; SI-NEXT: v_mov_b32_e32 v18, s26 +; SI-NEXT: v_readfirstlane_b32 s25, v19 +; SI-NEXT: v_mov_b32_e32 v19, s27 +; SI-NEXT: v_readfirstlane_b32 s22, v16 +; SI-NEXT: v_mov_b32_e32 v16, s28 +; SI-NEXT: v_readfirstlane_b32 s23, v17 +; SI-NEXT: v_mov_b32_e32 v17, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; SI-NEXT: v_writelane_b32 v16, s52, 12 -; SI-NEXT: v_readfirstlane_b32 s40, v1 -; SI-NEXT: v_readfirstlane_b32 s41, v2 +; SI-NEXT: v_writelane_b32 v20, s52, 12 +; SI-NEXT: v_readfirstlane_b32 s20, v18 +; SI-NEXT: v_readfirstlane_b32 s21, v19 +; SI-NEXT: v_readfirstlane_b32 s18, v16 +; SI-NEXT: v_readfirstlane_b32 s19, v17 +; SI-NEXT: v_readfirstlane_b32 s16, v1 +; SI-NEXT: v_readfirstlane_b32 s17, v2 ; SI-NEXT: v_readfirstlane_b32 s14, v3 ; SI-NEXT: v_readfirstlane_b32 s15, v4 ; SI-NEXT: v_readfirstlane_b32 s12, v5 @@ -3668,9 +3696,9 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3 ; SI-NEXT: v_readfirstlane_b32 s6, v11 ; SI-NEXT: v_readfirstlane_b32 s7, v12 ; SI-NEXT: v_readfirstlane_b32 s4, v13 -; SI-NEXT: s_and_b64 s[42:43], vcc, exec +; SI-NEXT: s_and_b64 s[26:27], vcc, exec ; SI-NEXT: v_readfirstlane_b32 s5, v14 -; SI-NEXT: v_writelane_b32 v16, s53, 13 +; SI-NEXT: v_writelane_b32 v20, s53, 13 ; SI-NEXT: s_cbranch_scc0 .LBB13_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s30, s5, 16 @@ -3679,46 +3707,46 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3 ; SI-NEXT: s_lshr_b32 s35, s11, 16 ; SI-NEXT: s_lshr_b32 s36, s13, 16 ; SI-NEXT: s_lshr_b32 s37, s15, 16 -; SI-NEXT: s_lshr_b32 s38, s41, 16 -; SI-NEXT: s_lshr_b32 s39, s29, 16 -; SI-NEXT: s_lshr_b32 s48, s27, 16 -; SI-NEXT: s_lshr_b32 s49, s25, 16 -; SI-NEXT: s_lshr_b32 s50, s23, 16 -; SI-NEXT: s_lshr_b32 s51, s21, 16 -; SI-NEXT: s_lshr_b32 s52, s19, 16 -; SI-NEXT: s_lshr_b32 s53, s17, 16 -; SI-NEXT: s_lshr_b64 s[42:43], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[6:7], 16 +; SI-NEXT: s_lshr_b32 s38, s17, 16 +; SI-NEXT: s_lshr_b32 s39, s19, 16 +; SI-NEXT: s_lshr_b32 s48, s21, 16 +; SI-NEXT: s_lshr_b32 s49, s23, 16 +; SI-NEXT: s_lshr_b32 s50, s25, 16 +; SI-NEXT: s_lshr_b32 s51, s41, 16 +; SI-NEXT: s_lshr_b32 s52, s43, 16 +; SI-NEXT: s_lshr_b32 s53, s45, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 ; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 16 ; SI-NEXT: s_lshr_b64 s[56:57], s[10:11], 16 ; SI-NEXT: s_lshr_b64 s[58:59], s[12:13], 16 ; SI-NEXT: s_lshr_b64 s[60:61], s[14:15], 16 -; SI-NEXT: s_lshr_b64 s[62:63], s[40:41], 16 -; SI-NEXT: s_lshr_b64 s[72:73], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[74:75], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[78:79], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[88:89], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[90:91], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[92:93], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[44:45], 16 ; SI-NEXT: s_cbranch_execnz .LBB13_3 ; SI-NEXT: .LBB13_2: ; %cmp.true -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s45, s45, 3 +; SI-NEXT: s_add_i32 s44, s44, 3 +; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: s_add_i32 s42, s42, 3 ; SI-NEXT: s_add_i32 s41, s41, 3 ; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s15, s15, 3 ; SI-NEXT: s_add_i32 s14, s14, 3 ; SI-NEXT: s_add_i32 s13, s13, 3 @@ -3731,135 +3759,135 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3 ; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s5, s5, 3 ; SI-NEXT: s_add_i32 s4, s4, 3 -; SI-NEXT: s_lshr_b64 s[42:43], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 ; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 16 ; SI-NEXT: s_lshr_b64 s[56:57], s[10:11], 16 ; SI-NEXT: s_lshr_b64 s[58:59], s[12:13], 16 ; SI-NEXT: s_lshr_b64 s[60:61], s[14:15], 16 -; SI-NEXT: s_lshr_b64 s[62:63], s[40:41], 16 -; SI-NEXT: s_lshr_b64 s[72:73], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[74:75], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[20:21], 16 ; SI-NEXT: s_lshr_b32 s30, s5, 16 ; SI-NEXT: s_lshr_b32 s31, s7, 16 ; SI-NEXT: s_lshr_b32 s34, s9, 16 ; SI-NEXT: s_lshr_b32 s35, s11, 16 ; SI-NEXT: s_lshr_b32 s36, s13, 16 ; SI-NEXT: s_lshr_b32 s37, s15, 16 -; SI-NEXT: s_lshr_b32 s38, s41, 16 -; SI-NEXT: s_lshr_b32 s39, s29, 16 -; SI-NEXT: s_lshr_b32 s48, s27, 16 -; SI-NEXT: s_lshr_b32 s49, s25, 16 -; SI-NEXT: s_lshr_b32 s50, s23, 16 -; SI-NEXT: s_lshr_b32 s51, s21, 16 -; SI-NEXT: s_lshr_b32 s52, s19, 16 -; SI-NEXT: s_lshr_b32 s53, s17, 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[78:79], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[88:89], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[90:91], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[92:93], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s38, s17, 16 +; SI-NEXT: s_lshr_b32 s39, s19, 16 +; SI-NEXT: s_lshr_b32 s48, s21, 16 +; SI-NEXT: s_lshr_b32 s49, s23, 16 +; SI-NEXT: s_lshr_b32 s50, s25, 16 +; SI-NEXT: s_lshr_b32 s51, s41, 16 +; SI-NEXT: s_lshr_b32 s52, s43, 16 +; SI-NEXT: s_lshr_b32 s53, s45, 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[44:45], 16 ; SI-NEXT: .LBB13_3: ; %end -; SI-NEXT: s_lshl_b32 s43, s92, 16 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s16, s16, s43 -; SI-NEXT: v_mov_b32_e32 v1, s16 -; SI-NEXT: s_and_b32 s16, s17, 0xffff -; SI-NEXT: s_lshl_b32 s17, s53, 16 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_lshl_b32 s16, s90, 16 -; SI-NEXT: s_and_b32 s17, s18, 0xffff -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_mov_b32_e32 v3, s16 -; SI-NEXT: s_and_b32 s16, s19, 0xffff -; SI-NEXT: s_lshl_b32 s17, s52, 16 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_mov_b32_e32 v4, s16 +; SI-NEXT: s_lshl_b32 s27, s92, 16 +; SI-NEXT: s_and_b32 s29, s44, 0xffff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: v_mov_b32_e32 v1, s27 +; SI-NEXT: s_and_b32 s27, s45, 0xffff +; SI-NEXT: s_lshl_b32 s29, s53, 16 +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_lshl_b32 s27, s90, 16 +; SI-NEXT: s_and_b32 s29, s42, 0xffff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: v_mov_b32_e32 v3, s27 +; SI-NEXT: s_and_b32 s27, s43, 0xffff +; SI-NEXT: s_lshl_b32 s29, s52, 16 +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_mov_b32_e32 v4, s27 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: s_lshl_b32 s16, s88, 16 -; SI-NEXT: s_and_b32 s17, s20, 0xffff +; SI-NEXT: s_lshl_b32 s27, s88, 16 +; SI-NEXT: s_and_b32 s29, s40, 0xffff ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_or_b32 s27, s29, s27 ; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s21, 0xffff -; SI-NEXT: s_lshl_b32 s17, s51, 16 +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_and_b32 s27, s41, 0xffff +; SI-NEXT: s_lshl_b32 s29, s51, 16 ; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s22, 0xffff -; SI-NEXT: s_lshl_b32 s17, s78, 16 +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_lshl_b32 s27, s78, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s24, s24, s27 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s23, 0xffff -; SI-NEXT: s_lshl_b32 s17, s50, 16 +; SI-NEXT: v_mov_b32_e32 v2, s24 +; SI-NEXT: s_and_b32 s24, s25, 0xffff +; SI-NEXT: s_lshl_b32 s25, s50, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s24, s24, s25 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s24, 0xffff -; SI-NEXT: s_lshl_b32 s17, s76, 16 +; SI-NEXT: v_mov_b32_e32 v2, s24 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_lshl_b32 s24, s76, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s22, s22, s24 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s25, 0xffff -; SI-NEXT: s_lshl_b32 s17, s49, 16 +; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: s_and_b32 s22, s23, 0xffff +; SI-NEXT: s_lshl_b32 s23, s49, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s22, s22, s23 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s26, 0xffff -; SI-NEXT: s_lshl_b32 s17, s74, 16 +; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_lshl_b32 s22, s74, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s20, s20, s22 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s27, 0xffff -; SI-NEXT: s_lshl_b32 s17, s48, 16 +; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_and_b32 s20, s21, 0xffff +; SI-NEXT: s_lshl_b32 s21, s48, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s20, s20, s21 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s28, 0xffff -; SI-NEXT: s_lshl_b32 s17, s72, 16 +; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s20, s72, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s18, s18, s20 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s29, 0xffff -; SI-NEXT: s_lshl_b32 s17, s39, 16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_and_b32 s18, s19, 0xffff +; SI-NEXT: s_lshl_b32 s19, s39, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s40, 0xffff -; SI-NEXT: s_lshl_b32 s17, s62, 16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s18, s62, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s16, s16, s18 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s41, 0xffff +; SI-NEXT: s_and_b32 s16, s17, 0xffff ; SI-NEXT: s_lshl_b32 s17, s38, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 ; SI-NEXT: s_or_b32 s16, s16, s17 @@ -3923,7 +3951,7 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_lshl_b32 s8, s44, 16 +; SI-NEXT: s_lshl_b32 s8, s28, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x5c, v0 ; SI-NEXT: s_or_b32 s6, s6, s8 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -3937,7 +3965,7 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s6, s42, 16 +; SI-NEXT: s_lshl_b32 s6, s26, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x64, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -3951,22 +3979,22 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: v_readlane_b32 s53, v16, 13 -; SI-NEXT: v_readlane_b32 s52, v16, 12 -; SI-NEXT: v_readlane_b32 s51, v16, 11 -; SI-NEXT: v_readlane_b32 s50, v16, 10 -; SI-NEXT: v_readlane_b32 s49, v16, 9 -; SI-NEXT: v_readlane_b32 s48, v16, 8 -; SI-NEXT: v_readlane_b32 s39, v16, 7 -; SI-NEXT: v_readlane_b32 s38, v16, 6 -; SI-NEXT: v_readlane_b32 s37, v16, 5 -; SI-NEXT: v_readlane_b32 s36, v16, 4 -; SI-NEXT: v_readlane_b32 s35, v16, 3 -; SI-NEXT: v_readlane_b32 s34, v16, 2 -; SI-NEXT: v_readlane_b32 s31, v16, 1 -; SI-NEXT: v_readlane_b32 s30, v16, 0 +; SI-NEXT: v_readlane_b32 s53, v20, 13 +; SI-NEXT: v_readlane_b32 s52, v20, 12 +; SI-NEXT: v_readlane_b32 s51, v20, 11 +; SI-NEXT: v_readlane_b32 s50, v20, 10 +; SI-NEXT: v_readlane_b32 s49, v20, 9 +; SI-NEXT: v_readlane_b32 s48, v20, 8 +; SI-NEXT: v_readlane_b32 s39, v20, 7 +; SI-NEXT: v_readlane_b32 s38, v20, 6 +; SI-NEXT: v_readlane_b32 s37, v20, 5 +; SI-NEXT: v_readlane_b32 s36, v20, 4 +; SI-NEXT: v_readlane_b32 s35, v20, 3 +; SI-NEXT: v_readlane_b32 s34, v20, 2 +; SI-NEXT: v_readlane_b32 s31, v20, 1 +; SI-NEXT: v_readlane_b32 s30, v20, 0 ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3995,9 +4023,9 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr35 ; SI-NEXT: ; implicit-def: $sgpr46 ; SI-NEXT: ; implicit-def: $sgpr34 -; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr31 -; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr30 ; SI-NEXT: s_branch .LBB13_2 ; @@ -4007,14 +4035,42 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3 ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v15, s16 +; VI-NEXT: v_mov_b32_e32 v16, s17 +; VI-NEXT: v_mov_b32_e32 v17, s18 +; VI-NEXT: v_mov_b32_e32 v18, s19 ; VI-NEXT: v_writelane_b32 v28, s30, 0 +; VI-NEXT: v_mov_b32_e32 v19, s20 +; VI-NEXT: v_readfirstlane_b32 s46, v15 +; VI-NEXT: v_mov_b32_e32 v15, s21 +; VI-NEXT: v_readfirstlane_b32 s45, v16 +; VI-NEXT: v_mov_b32_e32 v16, s22 +; VI-NEXT: v_readfirstlane_b32 s44, v17 +; VI-NEXT: v_mov_b32_e32 v17, s23 +; VI-NEXT: v_readfirstlane_b32 s43, v18 +; VI-NEXT: v_mov_b32_e32 v18, s24 ; VI-NEXT: v_writelane_b32 v28, s31, 1 +; VI-NEXT: v_readfirstlane_b32 s42, v19 +; VI-NEXT: v_mov_b32_e32 v19, s25 +; VI-NEXT: v_readfirstlane_b32 s41, v15 +; VI-NEXT: v_mov_b32_e32 v15, s26 +; VI-NEXT: v_readfirstlane_b32 s40, v16 +; VI-NEXT: v_mov_b32_e32 v16, s27 +; VI-NEXT: v_readfirstlane_b32 s26, v17 +; VI-NEXT: v_mov_b32_e32 v17, s28 +; VI-NEXT: v_readfirstlane_b32 s25, v18 +; VI-NEXT: v_mov_b32_e32 v18, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 ; VI-NEXT: v_writelane_b32 v28, s34, 2 -; VI-NEXT: v_readfirstlane_b32 s43, v0 -; VI-NEXT: v_readfirstlane_b32 s42, v1 -; VI-NEXT: v_readfirstlane_b32 s41, v2 -; VI-NEXT: v_readfirstlane_b32 s40, v3 +; VI-NEXT: v_readfirstlane_b32 s24, v19 +; VI-NEXT: v_readfirstlane_b32 s23, v15 +; VI-NEXT: v_readfirstlane_b32 s22, v16 +; VI-NEXT: v_readfirstlane_b32 s21, v17 +; VI-NEXT: v_readfirstlane_b32 s20, v18 +; VI-NEXT: v_readfirstlane_b32 s19, v0 +; VI-NEXT: v_readfirstlane_b32 s18, v1 +; VI-NEXT: v_readfirstlane_b32 s17, v2 +; VI-NEXT: v_readfirstlane_b32 s16, v3 ; VI-NEXT: v_readfirstlane_b32 s15, v4 ; VI-NEXT: v_readfirstlane_b32 s14, v5 ; VI-NEXT: v_readfirstlane_b32 s13, v6 @@ -4029,9 +4085,9 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3 ; VI-NEXT: v_writelane_b32 v28, s35, 3 ; VI-NEXT: s_cbranch_scc0 .LBB13_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s44, s7, 16 -; VI-NEXT: s_lshr_b32 s45, s6, 16 -; VI-NEXT: s_lshr_b32 s46, s8, 16 +; VI-NEXT: s_lshr_b32 s27, s7, 16 +; VI-NEXT: s_lshr_b32 s28, s6, 16 +; VI-NEXT: s_lshr_b32 s29, s8, 16 ; VI-NEXT: s_lshr_b32 s47, s9, 16 ; VI-NEXT: s_lshr_b32 s56, s10, 16 ; VI-NEXT: s_lshr_b32 s57, s11, 16 @@ -4039,24 +4095,24 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3 ; VI-NEXT: s_lshr_b32 s59, s13, 16 ; VI-NEXT: s_lshr_b32 s60, s14, 16 ; VI-NEXT: s_lshr_b32 s61, s15, 16 -; VI-NEXT: s_lshr_b32 s62, s40, 16 -; VI-NEXT: s_lshr_b32 s63, s41, 16 -; VI-NEXT: s_lshr_b32 s72, s42, 16 -; VI-NEXT: s_lshr_b32 s73, s43, 16 -; VI-NEXT: s_lshr_b32 s74, s29, 16 -; VI-NEXT: s_lshr_b32 s75, s28, 16 -; VI-NEXT: s_lshr_b32 s76, s27, 16 -; VI-NEXT: s_lshr_b32 s77, s26, 16 -; VI-NEXT: s_lshr_b32 s78, s25, 16 -; VI-NEXT: s_lshr_b32 s79, s24, 16 -; VI-NEXT: s_lshr_b32 s88, s23, 16 -; VI-NEXT: s_lshr_b32 s89, s22, 16 -; VI-NEXT: s_lshr_b32 s90, s21, 16 -; VI-NEXT: s_lshr_b32 s91, s20, 16 -; VI-NEXT: s_lshr_b32 s30, s19, 16 -; VI-NEXT: s_lshr_b32 s31, s18, 16 -; VI-NEXT: s_lshr_b32 s34, s17, 16 -; VI-NEXT: s_lshr_b32 s35, s16, 16 +; VI-NEXT: s_lshr_b32 s62, s16, 16 +; VI-NEXT: s_lshr_b32 s63, s17, 16 +; VI-NEXT: s_lshr_b32 s72, s18, 16 +; VI-NEXT: s_lshr_b32 s73, s19, 16 +; VI-NEXT: s_lshr_b32 s74, s20, 16 +; VI-NEXT: s_lshr_b32 s75, s21, 16 +; VI-NEXT: s_lshr_b32 s76, s22, 16 +; VI-NEXT: s_lshr_b32 s77, s23, 16 +; VI-NEXT: s_lshr_b32 s78, s24, 16 +; VI-NEXT: s_lshr_b32 s79, s25, 16 +; VI-NEXT: s_lshr_b32 s88, s26, 16 +; VI-NEXT: s_lshr_b32 s89, s40, 16 +; VI-NEXT: s_lshr_b32 s90, s41, 16 +; VI-NEXT: s_lshr_b32 s91, s42, 16 +; VI-NEXT: s_lshr_b32 s30, s43, 16 +; VI-NEXT: s_lshr_b32 s31, s44, 16 +; VI-NEXT: s_lshr_b32 s34, s45, 16 +; VI-NEXT: s_lshr_b32 s35, s46, 16 ; VI-NEXT: s_cbranch_execnz .LBB13_3 ; VI-NEXT: .LBB13_2: ; %cmp.true ; VI-NEXT: s_add_i32 s7, s7, 3 @@ -4069,27 +4125,27 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3 ; VI-NEXT: s_add_i32 s13, s13, 3 ; VI-NEXT: s_add_i32 s14, s14, 3 ; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 ; VI-NEXT: s_add_i32 s40, s40, 3 ; VI-NEXT: s_add_i32 s41, s41, 3 ; VI-NEXT: s_add_i32 s42, s42, 3 ; VI-NEXT: s_add_i32 s43, s43, 3 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_lshr_b32 s44, s7, 16 -; VI-NEXT: s_lshr_b32 s45, s6, 16 -; VI-NEXT: s_lshr_b32 s46, s8, 16 +; VI-NEXT: s_add_i32 s44, s44, 3 +; VI-NEXT: s_add_i32 s45, s45, 3 +; VI-NEXT: s_add_i32 s46, s46, 3 +; VI-NEXT: s_lshr_b32 s27, s7, 16 +; VI-NEXT: s_lshr_b32 s28, s6, 16 +; VI-NEXT: s_lshr_b32 s29, s8, 16 ; VI-NEXT: s_lshr_b32 s47, s9, 16 ; VI-NEXT: s_lshr_b32 s56, s10, 16 ; VI-NEXT: s_lshr_b32 s57, s11, 16 @@ -4097,127 +4153,127 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3 ; VI-NEXT: s_lshr_b32 s59, s13, 16 ; VI-NEXT: s_lshr_b32 s60, s14, 16 ; VI-NEXT: s_lshr_b32 s61, s15, 16 -; VI-NEXT: s_lshr_b32 s62, s40, 16 -; VI-NEXT: s_lshr_b32 s63, s41, 16 -; VI-NEXT: s_lshr_b32 s72, s42, 16 -; VI-NEXT: s_lshr_b32 s73, s43, 16 -; VI-NEXT: s_lshr_b32 s74, s29, 16 -; VI-NEXT: s_lshr_b32 s75, s28, 16 -; VI-NEXT: s_lshr_b32 s76, s27, 16 -; VI-NEXT: s_lshr_b32 s77, s26, 16 -; VI-NEXT: s_lshr_b32 s78, s25, 16 -; VI-NEXT: s_lshr_b32 s79, s24, 16 -; VI-NEXT: s_lshr_b32 s88, s23, 16 -; VI-NEXT: s_lshr_b32 s89, s22, 16 -; VI-NEXT: s_lshr_b32 s90, s21, 16 -; VI-NEXT: s_lshr_b32 s91, s20, 16 -; VI-NEXT: s_lshr_b32 s30, s19, 16 -; VI-NEXT: s_lshr_b32 s31, s18, 16 -; VI-NEXT: s_lshr_b32 s34, s17, 16 -; VI-NEXT: s_lshr_b32 s35, s16, 16 +; VI-NEXT: s_lshr_b32 s62, s16, 16 +; VI-NEXT: s_lshr_b32 s63, s17, 16 +; VI-NEXT: s_lshr_b32 s72, s18, 16 +; VI-NEXT: s_lshr_b32 s73, s19, 16 +; VI-NEXT: s_lshr_b32 s74, s20, 16 +; VI-NEXT: s_lshr_b32 s75, s21, 16 +; VI-NEXT: s_lshr_b32 s76, s22, 16 +; VI-NEXT: s_lshr_b32 s77, s23, 16 +; VI-NEXT: s_lshr_b32 s78, s24, 16 +; VI-NEXT: s_lshr_b32 s79, s25, 16 +; VI-NEXT: s_lshr_b32 s88, s26, 16 +; VI-NEXT: s_lshr_b32 s89, s40, 16 +; VI-NEXT: s_lshr_b32 s90, s41, 16 +; VI-NEXT: s_lshr_b32 s91, s42, 16 +; VI-NEXT: s_lshr_b32 s30, s43, 16 +; VI-NEXT: s_lshr_b32 s31, s44, 16 +; VI-NEXT: s_lshr_b32 s34, s45, 16 +; VI-NEXT: s_lshr_b32 s35, s46, 16 ; VI-NEXT: .LBB13_3: ; %end -; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_and_b32 s4, 0xffff, s46 ; VI-NEXT: s_lshl_b32 s5, s35, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s16, s34, 16 -; VI-NEXT: s_or_b32 s5, s5, s16 -; VI-NEXT: s_and_b32 s16, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s17, s31, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_and_b32 s17, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s18, s30, 16 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s18, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s19, s91, 16 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_and_b32 s19, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s20, s90, 16 -; VI-NEXT: s_or_b32 s19, s19, s20 -; VI-NEXT: s_and_b32 s20, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s21, s89, 16 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_and_b32 s21, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s22, s88, 16 -; VI-NEXT: s_or_b32 s21, s21, s22 -; VI-NEXT: s_and_b32 s22, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s23, s79, 16 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_and_b32 s23, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s24, s78, 16 -; VI-NEXT: s_or_b32 s23, s23, s24 -; VI-NEXT: s_and_b32 s24, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s25, s77, 16 -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_and_b32 s25, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s26, s76, 16 -; VI-NEXT: s_or_b32 s25, s25, s26 -; VI-NEXT: s_and_b32 s26, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s27, s75, 16 -; VI-NEXT: s_or_b32 s26, s26, s27 -; VI-NEXT: s_and_b32 s27, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s28, s74, 16 -; VI-NEXT: s_or_b32 s27, s27, s28 -; VI-NEXT: s_and_b32 s28, 0xffff, s43 -; VI-NEXT: s_lshl_b32 s29, s73, 16 -; VI-NEXT: s_or_b32 s28, s28, s29 -; VI-NEXT: s_and_b32 s29, 0xffff, s42 -; VI-NEXT: s_lshl_b32 s42, s72, 16 -; VI-NEXT: s_or_b32 s29, s29, s42 +; VI-NEXT: s_and_b32 s5, 0xffff, s45 +; VI-NEXT: s_lshl_b32 s45, s34, 16 +; VI-NEXT: s_or_b32 s5, s5, s45 +; VI-NEXT: s_and_b32 s44, 0xffff, s44 +; VI-NEXT: s_lshl_b32 s45, s31, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s43, 0xffff, s43 +; VI-NEXT: s_lshl_b32 s45, s30, 16 +; VI-NEXT: s_or_b32 s43, s43, s45 +; VI-NEXT: s_and_b32 s42, 0xffff, s42 +; VI-NEXT: s_lshl_b32 s45, s91, 16 +; VI-NEXT: s_or_b32 s42, s42, s45 ; VI-NEXT: s_and_b32 s41, 0xffff, s41 -; VI-NEXT: s_lshl_b32 s42, s63, 16 -; VI-NEXT: s_or_b32 s41, s41, s42 +; VI-NEXT: s_lshl_b32 s45, s90, 16 +; VI-NEXT: s_or_b32 s41, s41, s45 ; VI-NEXT: s_and_b32 s40, 0xffff, s40 -; VI-NEXT: s_lshl_b32 s42, s62, 16 -; VI-NEXT: s_or_b32 s40, s40, s42 +; VI-NEXT: s_lshl_b32 s45, s89, 16 +; VI-NEXT: s_or_b32 s40, s40, s45 +; VI-NEXT: s_and_b32 s26, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s45, s88, 16 +; VI-NEXT: s_or_b32 s26, s26, s45 +; VI-NEXT: s_and_b32 s25, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s45, s79, 16 +; VI-NEXT: s_or_b32 s25, s25, s45 +; VI-NEXT: s_and_b32 s24, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s45, s78, 16 +; VI-NEXT: s_or_b32 s24, s24, s45 +; VI-NEXT: s_and_b32 s23, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s45, s77, 16 +; VI-NEXT: s_or_b32 s23, s23, s45 +; VI-NEXT: s_and_b32 s22, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s45, s76, 16 +; VI-NEXT: s_or_b32 s22, s22, s45 +; VI-NEXT: s_and_b32 s21, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s45, s75, 16 +; VI-NEXT: s_or_b32 s21, s21, s45 +; VI-NEXT: s_and_b32 s20, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s45, s74, 16 +; VI-NEXT: s_or_b32 s20, s20, s45 +; VI-NEXT: s_and_b32 s19, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s45, s73, 16 +; VI-NEXT: s_or_b32 s19, s19, s45 +; VI-NEXT: s_and_b32 s18, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s72, 16 +; VI-NEXT: s_or_b32 s18, s18, s45 +; VI-NEXT: s_and_b32 s17, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s45, s63, 16 +; VI-NEXT: s_or_b32 s17, s17, s45 +; VI-NEXT: s_and_b32 s16, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s45, s62, 16 +; VI-NEXT: s_or_b32 s16, s16, s45 ; VI-NEXT: s_and_b32 s15, 0xffff, s15 -; VI-NEXT: s_lshl_b32 s42, s61, 16 -; VI-NEXT: s_or_b32 s15, s15, s42 +; VI-NEXT: s_lshl_b32 s45, s61, 16 +; VI-NEXT: s_or_b32 s15, s15, s45 ; VI-NEXT: s_and_b32 s14, 0xffff, s14 -; VI-NEXT: s_lshl_b32 s42, s60, 16 -; VI-NEXT: s_or_b32 s14, s14, s42 +; VI-NEXT: s_lshl_b32 s45, s60, 16 +; VI-NEXT: s_or_b32 s14, s14, s45 ; VI-NEXT: s_and_b32 s13, 0xffff, s13 -; VI-NEXT: s_lshl_b32 s42, s59, 16 -; VI-NEXT: s_or_b32 s13, s13, s42 +; VI-NEXT: s_lshl_b32 s45, s59, 16 +; VI-NEXT: s_or_b32 s13, s13, s45 ; VI-NEXT: s_and_b32 s12, 0xffff, s12 -; VI-NEXT: s_lshl_b32 s42, s58, 16 -; VI-NEXT: s_or_b32 s12, s12, s42 +; VI-NEXT: s_lshl_b32 s45, s58, 16 +; VI-NEXT: s_or_b32 s12, s12, s45 ; VI-NEXT: s_and_b32 s11, 0xffff, s11 -; VI-NEXT: s_lshl_b32 s42, s57, 16 -; VI-NEXT: s_or_b32 s11, s11, s42 +; VI-NEXT: s_lshl_b32 s45, s57, 16 +; VI-NEXT: s_or_b32 s11, s11, s45 ; VI-NEXT: s_and_b32 s10, 0xffff, s10 -; VI-NEXT: s_lshl_b32 s42, s56, 16 -; VI-NEXT: s_or_b32 s10, s10, s42 +; VI-NEXT: s_lshl_b32 s45, s56, 16 +; VI-NEXT: s_or_b32 s10, s10, s45 ; VI-NEXT: s_and_b32 s9, 0xffff, s9 -; VI-NEXT: s_lshl_b32 s42, s47, 16 -; VI-NEXT: s_or_b32 s9, s9, s42 +; VI-NEXT: s_lshl_b32 s45, s47, 16 ; VI-NEXT: s_and_b32 s8, 0xffff, s8 -; VI-NEXT: s_lshl_b32 s42, s46, 16 -; VI-NEXT: s_or_b32 s8, s8, s42 +; VI-NEXT: s_lshl_b32 s29, s29, 16 ; VI-NEXT: s_and_b32 s6, 0xffff, s6 -; VI-NEXT: s_lshl_b32 s42, s45, 16 -; VI-NEXT: s_or_b32 s6, s6, s42 +; VI-NEXT: s_lshl_b32 s28, s28, 16 ; VI-NEXT: s_and_b32 s7, 0xffff, s7 -; VI-NEXT: s_lshl_b32 s42, s44, 16 -; VI-NEXT: s_or_b32 s7, s7, s42 +; VI-NEXT: s_lshl_b32 s27, s27, 16 +; VI-NEXT: s_or_b32 s9, s9, s45 +; VI-NEXT: s_or_b32 s8, s8, s29 +; VI-NEXT: s_or_b32 s6, s6, s28 +; VI-NEXT: s_or_b32 s7, s7, s27 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: v_mov_b32_e32 v3, s17 -; VI-NEXT: v_mov_b32_e32 v4, s18 -; VI-NEXT: v_mov_b32_e32 v5, s19 -; VI-NEXT: v_mov_b32_e32 v6, s20 -; VI-NEXT: v_mov_b32_e32 v7, s21 -; VI-NEXT: v_mov_b32_e32 v8, s22 -; VI-NEXT: v_mov_b32_e32 v9, s23 -; VI-NEXT: v_mov_b32_e32 v10, s24 -; VI-NEXT: v_mov_b32_e32 v11, s25 -; VI-NEXT: v_mov_b32_e32 v12, s26 -; VI-NEXT: v_mov_b32_e32 v13, s27 -; VI-NEXT: v_mov_b32_e32 v14, s28 -; VI-NEXT: v_mov_b32_e32 v15, s29 -; VI-NEXT: v_mov_b32_e32 v16, s41 -; VI-NEXT: v_mov_b32_e32 v17, s40 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s43 +; VI-NEXT: v_mov_b32_e32 v4, s42 +; VI-NEXT: v_mov_b32_e32 v5, s41 +; VI-NEXT: v_mov_b32_e32 v6, s40 +; VI-NEXT: v_mov_b32_e32 v7, s26 +; VI-NEXT: v_mov_b32_e32 v8, s25 +; VI-NEXT: v_mov_b32_e32 v9, s24 +; VI-NEXT: v_mov_b32_e32 v10, s23 +; VI-NEXT: v_mov_b32_e32 v11, s22 +; VI-NEXT: v_mov_b32_e32 v12, s21 +; VI-NEXT: v_mov_b32_e32 v13, s20 +; VI-NEXT: v_mov_b32_e32 v14, s19 +; VI-NEXT: v_mov_b32_e32 v15, s18 +; VI-NEXT: v_mov_b32_e32 v16, s17 +; VI-NEXT: v_mov_b32_e32 v17, s16 ; VI-NEXT: v_mov_b32_e32 v18, s15 ; VI-NEXT: v_mov_b32_e32 v19, s14 ; VI-NEXT: v_mov_b32_e32 v20, s13 @@ -4263,25 +4319,53 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3 ; VI-NEXT: ; implicit-def: $sgpr57 ; VI-NEXT: ; implicit-def: $sgpr56 ; VI-NEXT: ; implicit-def: $sgpr47 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr45 -; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 ; VI-NEXT: s_branch .LBB13_2 ; ; GFX9-LABEL: bitcast_v28i32_to_v56i16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v15, s16 +; GFX9-NEXT: v_mov_b32_e32 v16, s17 +; GFX9-NEXT: v_mov_b32_e32 v17, s18 +; GFX9-NEXT: v_mov_b32_e32 v18, s19 +; GFX9-NEXT: v_mov_b32_e32 v19, s20 +; GFX9-NEXT: v_readfirstlane_b32 s6, v15 +; GFX9-NEXT: v_mov_b32_e32 v15, s21 +; GFX9-NEXT: v_readfirstlane_b32 s7, v16 +; GFX9-NEXT: v_mov_b32_e32 v16, s22 +; GFX9-NEXT: v_readfirstlane_b32 s8, v17 +; GFX9-NEXT: v_mov_b32_e32 v17, s23 +; GFX9-NEXT: v_readfirstlane_b32 s9, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, s24 +; GFX9-NEXT: v_readfirstlane_b32 s10, v19 +; GFX9-NEXT: v_mov_b32_e32 v19, s25 +; GFX9-NEXT: v_readfirstlane_b32 s11, v15 +; GFX9-NEXT: v_mov_b32_e32 v15, s26 +; GFX9-NEXT: v_readfirstlane_b32 s12, v16 +; GFX9-NEXT: v_mov_b32_e32 v16, s27 +; GFX9-NEXT: v_readfirstlane_b32 s13, v17 +; GFX9-NEXT: v_mov_b32_e32 v17, s28 +; GFX9-NEXT: v_readfirstlane_b32 s14, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: v_readfirstlane_b32 s7, v1 -; GFX9-NEXT: v_readfirstlane_b32 s8, v2 -; GFX9-NEXT: v_readfirstlane_b32 s9, v3 -; GFX9-NEXT: v_readfirstlane_b32 s10, v4 -; GFX9-NEXT: v_readfirstlane_b32 s11, v5 -; GFX9-NEXT: v_readfirstlane_b32 s12, v6 -; GFX9-NEXT: v_readfirstlane_b32 s13, v7 -; GFX9-NEXT: v_readfirstlane_b32 s14, v8 -; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s15, v19 +; GFX9-NEXT: v_readfirstlane_b32 s16, v15 +; GFX9-NEXT: v_readfirstlane_b32 s17, v16 +; GFX9-NEXT: v_readfirstlane_b32 s18, v17 +; GFX9-NEXT: v_readfirstlane_b32 s19, v18 +; GFX9-NEXT: v_readfirstlane_b32 s20, v0 +; GFX9-NEXT: v_readfirstlane_b32 s21, v1 +; GFX9-NEXT: v_readfirstlane_b32 s22, v2 +; GFX9-NEXT: v_readfirstlane_b32 s23, v3 +; GFX9-NEXT: v_readfirstlane_b32 s24, v4 +; GFX9-NEXT: v_readfirstlane_b32 s25, v5 +; GFX9-NEXT: v_readfirstlane_b32 s26, v6 +; GFX9-NEXT: v_readfirstlane_b32 s27, v7 +; GFX9-NEXT: v_readfirstlane_b32 s28, v8 +; GFX9-NEXT: v_readfirstlane_b32 s29, v9 ; GFX9-NEXT: v_readfirstlane_b32 s40, v10 ; GFX9-NEXT: v_readfirstlane_b32 s41, v11 ; GFX9-NEXT: v_readfirstlane_b32 s42, v12 @@ -4293,46 +4377,36 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3 ; GFX9-NEXT: s_lshr_b32 s45, s42, 16 ; GFX9-NEXT: s_lshr_b32 s46, s41, 16 ; GFX9-NEXT: s_lshr_b32 s47, s40, 16 -; GFX9-NEXT: s_lshr_b32 s56, s15, 16 -; GFX9-NEXT: s_lshr_b32 s57, s14, 16 -; GFX9-NEXT: s_lshr_b32 s58, s13, 16 -; GFX9-NEXT: s_lshr_b32 s59, s12, 16 -; GFX9-NEXT: s_lshr_b32 s60, s11, 16 -; GFX9-NEXT: s_lshr_b32 s61, s10, 16 -; GFX9-NEXT: s_lshr_b32 s62, s9, 16 -; GFX9-NEXT: s_lshr_b32 s63, s8, 16 -; GFX9-NEXT: s_lshr_b32 s72, s7, 16 -; GFX9-NEXT: s_lshr_b32 s73, s6, 16 -; GFX9-NEXT: s_lshr_b32 s74, s29, 16 -; GFX9-NEXT: s_lshr_b32 s75, s28, 16 -; GFX9-NEXT: s_lshr_b32 s76, s27, 16 -; GFX9-NEXT: s_lshr_b32 s77, s26, 16 -; GFX9-NEXT: s_lshr_b32 s78, s25, 16 -; GFX9-NEXT: s_lshr_b32 s79, s24, 16 -; GFX9-NEXT: s_lshr_b32 s88, s23, 16 -; GFX9-NEXT: s_lshr_b32 s89, s22, 16 -; GFX9-NEXT: s_lshr_b32 s90, s21, 16 -; GFX9-NEXT: s_lshr_b32 s91, s20, 16 -; GFX9-NEXT: s_lshr_b32 s92, s19, 16 -; GFX9-NEXT: s_lshr_b32 s93, s18, 16 -; GFX9-NEXT: s_lshr_b32 s94, s17, 16 -; GFX9-NEXT: s_lshr_b32 s95, s16, 16 +; GFX9-NEXT: s_lshr_b32 s56, s29, 16 +; GFX9-NEXT: s_lshr_b32 s57, s28, 16 +; GFX9-NEXT: s_lshr_b32 s58, s27, 16 +; GFX9-NEXT: s_lshr_b32 s59, s26, 16 +; GFX9-NEXT: s_lshr_b32 s60, s25, 16 +; GFX9-NEXT: s_lshr_b32 s61, s24, 16 +; GFX9-NEXT: s_lshr_b32 s62, s23, 16 +; GFX9-NEXT: s_lshr_b32 s63, s22, 16 +; GFX9-NEXT: s_lshr_b32 s72, s21, 16 +; GFX9-NEXT: s_lshr_b32 s73, s20, 16 +; GFX9-NEXT: s_lshr_b32 s74, s19, 16 +; GFX9-NEXT: s_lshr_b32 s75, s18, 16 +; GFX9-NEXT: s_lshr_b32 s76, s17, 16 +; GFX9-NEXT: s_lshr_b32 s77, s16, 16 +; GFX9-NEXT: s_lshr_b32 s78, s15, 16 +; GFX9-NEXT: s_lshr_b32 s79, s14, 16 +; GFX9-NEXT: s_lshr_b32 s88, s13, 16 +; GFX9-NEXT: s_lshr_b32 s89, s12, 16 +; GFX9-NEXT: s_lshr_b32 s90, s11, 16 +; GFX9-NEXT: s_lshr_b32 s91, s10, 16 +; GFX9-NEXT: s_lshr_b32 s92, s9, 16 +; GFX9-NEXT: s_lshr_b32 s93, s8, 16 +; GFX9-NEXT: s_lshr_b32 s94, s7, 16 +; GFX9-NEXT: s_lshr_b32 s95, s6, 16 ; GFX9-NEXT: s_cbranch_execnz .LBB13_3 ; GFX9-NEXT: .LBB13_2: ; %cmp.true ; GFX9-NEXT: s_add_i32 s43, s43, 3 ; GFX9-NEXT: s_add_i32 s42, s42, 3 ; GFX9-NEXT: s_add_i32 s41, s41, 3 ; GFX9-NEXT: s_add_i32 s40, s40, 3 -; GFX9-NEXT: s_add_i32 s15, s15, 3 -; GFX9-NEXT: s_add_i32 s14, s14, 3 -; GFX9-NEXT: s_add_i32 s13, s13, 3 -; GFX9-NEXT: s_add_i32 s12, s12, 3 -; GFX9-NEXT: s_add_i32 s11, s11, 3 -; GFX9-NEXT: s_add_i32 s10, s10, 3 -; GFX9-NEXT: s_add_i32 s9, s9, 3 -; GFX9-NEXT: s_add_i32 s8, s8, 3 -; GFX9-NEXT: s_add_i32 s7, s7, 3 -; GFX9-NEXT: s_add_i32 s6, s6, 3 ; GFX9-NEXT: s_add_i32 s29, s29, 3 ; GFX9-NEXT: s_add_i32 s28, s28, 3 ; GFX9-NEXT: s_add_i32 s27, s27, 3 @@ -4347,87 +4421,97 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3 ; GFX9-NEXT: s_add_i32 s18, s18, 3 ; GFX9-NEXT: s_add_i32 s17, s17, 3 ; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s15, s15, 3 +; GFX9-NEXT: s_add_i32 s14, s14, 3 +; GFX9-NEXT: s_add_i32 s13, s13, 3 +; GFX9-NEXT: s_add_i32 s12, s12, 3 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_add_i32 s10, s10, 3 +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: s_add_i32 s6, s6, 3 ; GFX9-NEXT: s_lshr_b32 s44, s43, 16 ; GFX9-NEXT: s_lshr_b32 s45, s42, 16 ; GFX9-NEXT: s_lshr_b32 s46, s41, 16 ; GFX9-NEXT: s_lshr_b32 s47, s40, 16 -; GFX9-NEXT: s_lshr_b32 s56, s15, 16 -; GFX9-NEXT: s_lshr_b32 s57, s14, 16 -; GFX9-NEXT: s_lshr_b32 s58, s13, 16 -; GFX9-NEXT: s_lshr_b32 s59, s12, 16 -; GFX9-NEXT: s_lshr_b32 s60, s11, 16 -; GFX9-NEXT: s_lshr_b32 s61, s10, 16 -; GFX9-NEXT: s_lshr_b32 s62, s9, 16 -; GFX9-NEXT: s_lshr_b32 s63, s8, 16 -; GFX9-NEXT: s_lshr_b32 s72, s7, 16 -; GFX9-NEXT: s_lshr_b32 s73, s6, 16 -; GFX9-NEXT: s_lshr_b32 s74, s29, 16 -; GFX9-NEXT: s_lshr_b32 s75, s28, 16 -; GFX9-NEXT: s_lshr_b32 s76, s27, 16 -; GFX9-NEXT: s_lshr_b32 s77, s26, 16 -; GFX9-NEXT: s_lshr_b32 s78, s25, 16 -; GFX9-NEXT: s_lshr_b32 s79, s24, 16 -; GFX9-NEXT: s_lshr_b32 s88, s23, 16 -; GFX9-NEXT: s_lshr_b32 s89, s22, 16 -; GFX9-NEXT: s_lshr_b32 s90, s21, 16 -; GFX9-NEXT: s_lshr_b32 s91, s20, 16 -; GFX9-NEXT: s_lshr_b32 s92, s19, 16 -; GFX9-NEXT: s_lshr_b32 s93, s18, 16 -; GFX9-NEXT: s_lshr_b32 s94, s17, 16 -; GFX9-NEXT: s_lshr_b32 s95, s16, 16 +; GFX9-NEXT: s_lshr_b32 s56, s29, 16 +; GFX9-NEXT: s_lshr_b32 s57, s28, 16 +; GFX9-NEXT: s_lshr_b32 s58, s27, 16 +; GFX9-NEXT: s_lshr_b32 s59, s26, 16 +; GFX9-NEXT: s_lshr_b32 s60, s25, 16 +; GFX9-NEXT: s_lshr_b32 s61, s24, 16 +; GFX9-NEXT: s_lshr_b32 s62, s23, 16 +; GFX9-NEXT: s_lshr_b32 s63, s22, 16 +; GFX9-NEXT: s_lshr_b32 s72, s21, 16 +; GFX9-NEXT: s_lshr_b32 s73, s20, 16 +; GFX9-NEXT: s_lshr_b32 s74, s19, 16 +; GFX9-NEXT: s_lshr_b32 s75, s18, 16 +; GFX9-NEXT: s_lshr_b32 s76, s17, 16 +; GFX9-NEXT: s_lshr_b32 s77, s16, 16 +; GFX9-NEXT: s_lshr_b32 s78, s15, 16 +; GFX9-NEXT: s_lshr_b32 s79, s14, 16 +; GFX9-NEXT: s_lshr_b32 s88, s13, 16 +; GFX9-NEXT: s_lshr_b32 s89, s12, 16 +; GFX9-NEXT: s_lshr_b32 s90, s11, 16 +; GFX9-NEXT: s_lshr_b32 s91, s10, 16 +; GFX9-NEXT: s_lshr_b32 s92, s9, 16 +; GFX9-NEXT: s_lshr_b32 s93, s8, 16 +; GFX9-NEXT: s_lshr_b32 s94, s7, 16 +; GFX9-NEXT: s_lshr_b32 s95, s6, 16 ; GFX9-NEXT: .LBB13_3: ; %end -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s95 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s94 -; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s93 -; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s92 -; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s91 -; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s90 -; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s89 -; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s88 -; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s79 -; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s78 -; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s77 -; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s76 -; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s75 -; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s74 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s73 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s72 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s63 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s62 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s61 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s60 -; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s59 -; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s58 -; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s57 -; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s95 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s7, s94 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s93 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s9, s92 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s10, s91 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s11, s90 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s12, s89 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s13, s88 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s14, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s15, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s16, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s17, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s56 ; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s47 ; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s46 ; GFX9-NEXT: s_pack_ll_b32_b16 s40, s42, s45 ; GFX9-NEXT: s_pack_ll_b32_b16 s41, s43, s44 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s18 -; GFX9-NEXT: v_mov_b32_e32 v5, s19 -; GFX9-NEXT: v_mov_b32_e32 v6, s20 -; GFX9-NEXT: v_mov_b32_e32 v7, s21 -; GFX9-NEXT: v_mov_b32_e32 v8, s22 -; GFX9-NEXT: v_mov_b32_e32 v9, s23 -; GFX9-NEXT: v_mov_b32_e32 v10, s24 -; GFX9-NEXT: v_mov_b32_e32 v11, s25 -; GFX9-NEXT: v_mov_b32_e32 v12, s26 -; GFX9-NEXT: v_mov_b32_e32 v13, s27 -; GFX9-NEXT: v_mov_b32_e32 v14, s6 -; GFX9-NEXT: v_mov_b32_e32 v15, s7 -; GFX9-NEXT: v_mov_b32_e32 v16, s8 -; GFX9-NEXT: v_mov_b32_e32 v17, s9 -; GFX9-NEXT: v_mov_b32_e32 v18, s10 -; GFX9-NEXT: v_mov_b32_e32 v19, s11 -; GFX9-NEXT: v_mov_b32_e32 v20, s12 -; GFX9-NEXT: v_mov_b32_e32 v21, s13 -; GFX9-NEXT: v_mov_b32_e32 v22, s14 -; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-NEXT: v_mov_b32_e32 v12, s16 +; GFX9-NEXT: v_mov_b32_e32 v13, s17 +; GFX9-NEXT: v_mov_b32_e32 v14, s18 +; GFX9-NEXT: v_mov_b32_e32 v15, s19 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v18, s22 +; GFX9-NEXT: v_mov_b32_e32 v19, s23 +; GFX9-NEXT: v_mov_b32_e32 v20, s24 +; GFX9-NEXT: v_mov_b32_e32 v21, s25 +; GFX9-NEXT: v_mov_b32_e32 v22, s26 +; GFX9-NEXT: v_mov_b32_e32 v23, s27 ; GFX9-NEXT: v_mov_b32_e32 v24, s28 ; GFX9-NEXT: v_mov_b32_e32 v25, s29 ; GFX9-NEXT: v_mov_b32_e32 v26, s40 @@ -4467,45 +4551,72 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3 ; GFX11-LABEL: bitcast_v28i32_to_v56i16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v11, s0 :: v_dual_mov_b32 v12, s1 +; GFX11-NEXT: v_dual_mov_b32 v13, s2 :: v_dual_mov_b32 v14, s3 +; GFX11-NEXT: v_dual_mov_b32 v15, s16 :: v_dual_mov_b32 v16, s17 +; GFX11-NEXT: v_dual_mov_b32 v17, s18 :: v_dual_mov_b32 v18, s19 +; GFX11-NEXT: v_dual_mov_b32 v19, s20 :: v_dual_mov_b32 v20, s21 +; GFX11-NEXT: v_dual_mov_b32 v21, s22 :: v_dual_mov_b32 v22, s23 +; GFX11-NEXT: v_dual_mov_b32 v23, s24 :: v_dual_mov_b32 v24, s25 +; GFX11-NEXT: v_dual_mov_b32 v25, s26 :: v_dual_mov_b32 v26, s27 +; GFX11-NEXT: v_dual_mov_b32 v27, s28 :: v_dual_mov_b32 v28, s29 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: v_readfirstlane_b32 s8, v4 -; GFX11-NEXT: v_readfirstlane_b32 s9, v5 -; GFX11-NEXT: v_readfirstlane_b32 s10, v6 -; GFX11-NEXT: v_readfirstlane_b32 s11, v7 -; GFX11-NEXT: v_readfirstlane_b32 s13, v8 -; GFX11-NEXT: v_readfirstlane_b32 s12, v9 +; GFX11-NEXT: v_readfirstlane_b32 s0, v11 +; GFX11-NEXT: v_readfirstlane_b32 s1, v12 +; GFX11-NEXT: v_readfirstlane_b32 s2, v13 +; GFX11-NEXT: v_readfirstlane_b32 s3, v14 +; GFX11-NEXT: v_readfirstlane_b32 s4, v15 +; GFX11-NEXT: v_readfirstlane_b32 s5, v16 +; GFX11-NEXT: v_readfirstlane_b32 s6, v17 +; GFX11-NEXT: v_readfirstlane_b32 s7, v18 +; GFX11-NEXT: v_readfirstlane_b32 s8, v19 +; GFX11-NEXT: v_readfirstlane_b32 s9, v20 +; GFX11-NEXT: v_readfirstlane_b32 s10, v21 +; GFX11-NEXT: v_readfirstlane_b32 s11, v22 +; GFX11-NEXT: v_readfirstlane_b32 s12, v23 +; GFX11-NEXT: v_readfirstlane_b32 s13, v24 +; GFX11-NEXT: v_readfirstlane_b32 s14, v25 +; GFX11-NEXT: v_readfirstlane_b32 s15, v26 +; GFX11-NEXT: v_readfirstlane_b32 s16, v27 +; GFX11-NEXT: v_readfirstlane_b32 s17, v28 +; GFX11-NEXT: v_readfirstlane_b32 s18, v0 +; GFX11-NEXT: v_readfirstlane_b32 s19, v1 +; GFX11-NEXT: v_readfirstlane_b32 s20, v2 +; GFX11-NEXT: v_readfirstlane_b32 s21, v3 +; GFX11-NEXT: v_readfirstlane_b32 s22, v4 +; GFX11-NEXT: v_readfirstlane_b32 s23, v5 +; GFX11-NEXT: v_readfirstlane_b32 s24, v6 +; GFX11-NEXT: v_readfirstlane_b32 s25, v7 +; GFX11-NEXT: v_readfirstlane_b32 s27, v8 +; GFX11-NEXT: v_readfirstlane_b32 s26, v9 ; GFX11-NEXT: s_mov_b32 s90, 0 -; GFX11-NEXT: s_and_b32 s14, vcc_lo, exec_lo +; GFX11-NEXT: s_and_b32 s28, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB13_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: s_lshr_b32 s14, s12, 16 -; GFX11-NEXT: s_lshr_b32 s15, s13, 16 -; GFX11-NEXT: s_lshr_b32 s40, s11, 16 -; GFX11-NEXT: s_lshr_b32 s41, s10, 16 -; GFX11-NEXT: s_lshr_b32 s42, s9, 16 -; GFX11-NEXT: s_lshr_b32 s43, s8, 16 -; GFX11-NEXT: s_lshr_b32 s44, s7, 16 -; GFX11-NEXT: s_lshr_b32 s45, s6, 16 -; GFX11-NEXT: s_lshr_b32 s46, s5, 16 -; GFX11-NEXT: s_lshr_b32 s47, s4, 16 -; GFX11-NEXT: s_lshr_b32 s56, s29, 16 -; GFX11-NEXT: s_lshr_b32 s57, s28, 16 -; GFX11-NEXT: s_lshr_b32 s58, s27, 16 -; GFX11-NEXT: s_lshr_b32 s59, s26, 16 -; GFX11-NEXT: s_lshr_b32 s60, s25, 16 -; GFX11-NEXT: s_lshr_b32 s61, s24, 16 -; GFX11-NEXT: s_lshr_b32 s62, s23, 16 -; GFX11-NEXT: s_lshr_b32 s63, s22, 16 -; GFX11-NEXT: s_lshr_b32 s72, s21, 16 -; GFX11-NEXT: s_lshr_b32 s73, s20, 16 -; GFX11-NEXT: s_lshr_b32 s74, s19, 16 -; GFX11-NEXT: s_lshr_b32 s75, s18, 16 -; GFX11-NEXT: s_lshr_b32 s76, s17, 16 -; GFX11-NEXT: s_lshr_b32 s77, s16, 16 +; GFX11-NEXT: s_lshr_b32 s28, s26, 16 +; GFX11-NEXT: s_lshr_b32 s29, s27, 16 +; GFX11-NEXT: s_lshr_b32 s40, s25, 16 +; GFX11-NEXT: s_lshr_b32 s41, s24, 16 +; GFX11-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-NEXT: s_lshr_b32 s44, s21, 16 +; GFX11-NEXT: s_lshr_b32 s45, s20, 16 +; GFX11-NEXT: s_lshr_b32 s46, s19, 16 +; GFX11-NEXT: s_lshr_b32 s47, s18, 16 +; GFX11-NEXT: s_lshr_b32 s56, s17, 16 +; GFX11-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-NEXT: s_lshr_b32 s58, s15, 16 +; GFX11-NEXT: s_lshr_b32 s59, s14, 16 +; GFX11-NEXT: s_lshr_b32 s60, s13, 16 +; GFX11-NEXT: s_lshr_b32 s61, s12, 16 +; GFX11-NEXT: s_lshr_b32 s62, s11, 16 +; GFX11-NEXT: s_lshr_b32 s63, s10, 16 +; GFX11-NEXT: s_lshr_b32 s72, s9, 16 +; GFX11-NEXT: s_lshr_b32 s73, s8, 16 +; GFX11-NEXT: s_lshr_b32 s74, s7, 16 +; GFX11-NEXT: s_lshr_b32 s75, s6, 16 +; GFX11-NEXT: s_lshr_b32 s76, s5, 16 +; GFX11-NEXT: s_lshr_b32 s77, s4, 16 ; GFX11-NEXT: s_lshr_b32 s78, s3, 16 ; GFX11-NEXT: s_lshr_b32 s79, s2, 16 ; GFX11-NEXT: s_lshr_b32 s88, s1, 16 @@ -4513,20 +4624,8 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s90 ; GFX11-NEXT: s_cbranch_vccnz .LBB13_3 ; GFX11-NEXT: .LBB13_2: ; %cmp.true -; GFX11-NEXT: s_add_i32 s12, s12, 3 -; GFX11-NEXT: s_add_i32 s13, s13, 3 -; GFX11-NEXT: s_add_i32 s11, s11, 3 -; GFX11-NEXT: s_add_i32 s10, s10, 3 -; GFX11-NEXT: s_add_i32 s9, s9, 3 -; GFX11-NEXT: s_add_i32 s8, s8, 3 -; GFX11-NEXT: s_add_i32 s7, s7, 3 -; GFX11-NEXT: s_add_i32 s6, s6, 3 -; GFX11-NEXT: s_add_i32 s5, s5, 3 -; GFX11-NEXT: s_add_i32 s4, s4, 3 -; GFX11-NEXT: s_add_i32 s29, s29, 3 -; GFX11-NEXT: s_add_i32 s28, s28, 3 -; GFX11-NEXT: s_add_i32 s27, s27, 3 ; GFX11-NEXT: s_add_i32 s26, s26, 3 +; GFX11-NEXT: s_add_i32 s27, s27, 3 ; GFX11-NEXT: s_add_i32 s25, s25, 3 ; GFX11-NEXT: s_add_i32 s24, s24, 3 ; GFX11-NEXT: s_add_i32 s23, s23, 3 @@ -4537,34 +4636,46 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3 ; GFX11-NEXT: s_add_i32 s18, s18, 3 ; GFX11-NEXT: s_add_i32 s17, s17, 3 ; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s15, s15, 3 +; GFX11-NEXT: s_add_i32 s14, s14, 3 +; GFX11-NEXT: s_add_i32 s13, s13, 3 +; GFX11-NEXT: s_add_i32 s12, s12, 3 +; GFX11-NEXT: s_add_i32 s11, s11, 3 +; GFX11-NEXT: s_add_i32 s10, s10, 3 +; GFX11-NEXT: s_add_i32 s9, s9, 3 +; GFX11-NEXT: s_add_i32 s8, s8, 3 +; GFX11-NEXT: s_add_i32 s7, s7, 3 +; GFX11-NEXT: s_add_i32 s6, s6, 3 +; GFX11-NEXT: s_add_i32 s5, s5, 3 +; GFX11-NEXT: s_add_i32 s4, s4, 3 ; GFX11-NEXT: s_add_i32 s3, s3, 3 ; GFX11-NEXT: s_add_i32 s2, s2, 3 ; GFX11-NEXT: s_add_i32 s1, s1, 3 ; GFX11-NEXT: s_add_i32 s0, s0, 3 -; GFX11-NEXT: s_lshr_b32 s14, s12, 16 -; GFX11-NEXT: s_lshr_b32 s15, s13, 16 -; GFX11-NEXT: s_lshr_b32 s40, s11, 16 -; GFX11-NEXT: s_lshr_b32 s41, s10, 16 -; GFX11-NEXT: s_lshr_b32 s42, s9, 16 -; GFX11-NEXT: s_lshr_b32 s43, s8, 16 -; GFX11-NEXT: s_lshr_b32 s44, s7, 16 -; GFX11-NEXT: s_lshr_b32 s45, s6, 16 -; GFX11-NEXT: s_lshr_b32 s46, s5, 16 -; GFX11-NEXT: s_lshr_b32 s47, s4, 16 -; GFX11-NEXT: s_lshr_b32 s56, s29, 16 -; GFX11-NEXT: s_lshr_b32 s57, s28, 16 -; GFX11-NEXT: s_lshr_b32 s58, s27, 16 -; GFX11-NEXT: s_lshr_b32 s59, s26, 16 -; GFX11-NEXT: s_lshr_b32 s60, s25, 16 -; GFX11-NEXT: s_lshr_b32 s61, s24, 16 -; GFX11-NEXT: s_lshr_b32 s62, s23, 16 -; GFX11-NEXT: s_lshr_b32 s63, s22, 16 -; GFX11-NEXT: s_lshr_b32 s72, s21, 16 -; GFX11-NEXT: s_lshr_b32 s73, s20, 16 -; GFX11-NEXT: s_lshr_b32 s74, s19, 16 -; GFX11-NEXT: s_lshr_b32 s75, s18, 16 -; GFX11-NEXT: s_lshr_b32 s76, s17, 16 -; GFX11-NEXT: s_lshr_b32 s77, s16, 16 +; GFX11-NEXT: s_lshr_b32 s28, s26, 16 +; GFX11-NEXT: s_lshr_b32 s29, s27, 16 +; GFX11-NEXT: s_lshr_b32 s40, s25, 16 +; GFX11-NEXT: s_lshr_b32 s41, s24, 16 +; GFX11-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-NEXT: s_lshr_b32 s44, s21, 16 +; GFX11-NEXT: s_lshr_b32 s45, s20, 16 +; GFX11-NEXT: s_lshr_b32 s46, s19, 16 +; GFX11-NEXT: s_lshr_b32 s47, s18, 16 +; GFX11-NEXT: s_lshr_b32 s56, s17, 16 +; GFX11-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-NEXT: s_lshr_b32 s58, s15, 16 +; GFX11-NEXT: s_lshr_b32 s59, s14, 16 +; GFX11-NEXT: s_lshr_b32 s60, s13, 16 +; GFX11-NEXT: s_lshr_b32 s61, s12, 16 +; GFX11-NEXT: s_lshr_b32 s62, s11, 16 +; GFX11-NEXT: s_lshr_b32 s63, s10, 16 +; GFX11-NEXT: s_lshr_b32 s72, s9, 16 +; GFX11-NEXT: s_lshr_b32 s73, s8, 16 +; GFX11-NEXT: s_lshr_b32 s74, s7, 16 +; GFX11-NEXT: s_lshr_b32 s75, s6, 16 +; GFX11-NEXT: s_lshr_b32 s76, s5, 16 +; GFX11-NEXT: s_lshr_b32 s77, s4, 16 ; GFX11-NEXT: s_lshr_b32 s78, s3, 16 ; GFX11-NEXT: s_lshr_b32 s79, s2, 16 ; GFX11-NEXT: s_lshr_b32 s88, s1, 16 @@ -4575,44 +4686,44 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3 ; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s88 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s79 ; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s78 -; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s77 -; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s76 -; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s75 -; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s74 -; GFX11-NEXT: s_pack_ll_b32_b16 s20, s20, s73 -; GFX11-NEXT: s_pack_ll_b32_b16 s21, s21, s72 -; GFX11-NEXT: s_pack_ll_b32_b16 s22, s22, s63 -; GFX11-NEXT: s_pack_ll_b32_b16 s23, s23, s62 -; GFX11-NEXT: s_pack_ll_b32_b16 s24, s24, s61 -; GFX11-NEXT: s_pack_ll_b32_b16 s25, s25, s60 -; GFX11-NEXT: s_pack_ll_b32_b16 s26, s26, s59 -; GFX11-NEXT: s_pack_ll_b32_b16 s27, s27, s58 -; GFX11-NEXT: s_pack_ll_b32_b16 s28, s28, s57 -; GFX11-NEXT: s_pack_ll_b32_b16 s29, s29, s56 -; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s47 -; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s46 -; GFX11-NEXT: s_pack_ll_b32_b16 s6, s6, s45 -; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s44 -; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s43 -; GFX11-NEXT: s_pack_ll_b32_b16 s9, s9, s42 -; GFX11-NEXT: s_pack_ll_b32_b16 s10, s10, s41 -; GFX11-NEXT: s_pack_ll_b32_b16 s11, s11, s40 -; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s15 -; GFX11-NEXT: s_pack_ll_b32_b16 s12, s12, s14 +; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s77 +; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s76 +; GFX11-NEXT: s_pack_ll_b32_b16 s6, s6, s75 +; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s74 +; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s73 +; GFX11-NEXT: s_pack_ll_b32_b16 s9, s9, s72 +; GFX11-NEXT: s_pack_ll_b32_b16 s10, s10, s63 +; GFX11-NEXT: s_pack_ll_b32_b16 s11, s11, s62 +; GFX11-NEXT: s_pack_ll_b32_b16 s12, s12, s61 +; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s60 +; GFX11-NEXT: s_pack_ll_b32_b16 s14, s14, s59 +; GFX11-NEXT: s_pack_ll_b32_b16 s15, s15, s58 +; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s57 +; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s56 +; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s47 +; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s46 +; GFX11-NEXT: s_pack_ll_b32_b16 s20, s20, s45 +; GFX11-NEXT: s_pack_ll_b32_b16 s21, s21, s44 +; GFX11-NEXT: s_pack_ll_b32_b16 s22, s22, s43 +; GFX11-NEXT: s_pack_ll_b32_b16 s23, s23, s42 +; GFX11-NEXT: s_pack_ll_b32_b16 s24, s24, s41 +; GFX11-NEXT: s_pack_ll_b32_b16 s25, s25, s40 +; GFX11-NEXT: s_pack_ll_b32_b16 s27, s27, s29 +; GFX11-NEXT: s_pack_ll_b32_b16 s26, s26, s28 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 -; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 -; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 -; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 -; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 -; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 -; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 -; GFX11-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 -; GFX11-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 -; GFX11-NEXT: v_dual_mov_b32 v22, s8 :: v_dual_mov_b32 v23, s9 -; GFX11-NEXT: v_dual_mov_b32 v24, s10 :: v_dual_mov_b32 v25, s11 -; GFX11-NEXT: v_dual_mov_b32 v26, s13 :: v_dual_mov_b32 v27, s12 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GFX11-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v21, s21 +; GFX11-NEXT: v_dual_mov_b32 v22, s22 :: v_dual_mov_b32 v23, s23 +; GFX11-NEXT: v_dual_mov_b32 v24, s24 :: v_dual_mov_b32 v25, s25 +; GFX11-NEXT: v_dual_mov_b32 v26, s27 :: v_dual_mov_b32 v27, s26 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB13_4: ; GFX11-NEXT: ; implicit-def: $sgpr89 @@ -4641,8 +4752,8 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr41 ; GFX11-NEXT: ; implicit-def: $sgpr40 -; GFX11-NEXT: ; implicit-def: $sgpr15 -; GFX11-NEXT: ; implicit-def: $sgpr14 +; GFX11-NEXT: ; implicit-def: $sgpr29 +; GFX11-NEXT: ; implicit-def: $sgpr28 ; GFX11-NEXT: s_branch .LBB13_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -5128,7 +5239,7 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 ; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 ; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 -; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v26 ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v27 ; SI-NEXT: .LBB14_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] @@ -8458,11 +8569,39 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i ; SI-LABEL: bitcast_v28i32_to_v56f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v16, s16 +; SI-NEXT: v_mov_b32_e32 v17, s17 +; SI-NEXT: v_mov_b32_e32 v18, s18 +; SI-NEXT: v_mov_b32_e32 v19, s19 +; SI-NEXT: v_readfirstlane_b32 s40, v16 +; SI-NEXT: v_mov_b32_e32 v16, s20 +; SI-NEXT: v_readfirstlane_b32 s41, v17 +; SI-NEXT: v_mov_b32_e32 v17, s21 +; SI-NEXT: v_readfirstlane_b32 s42, v18 +; SI-NEXT: v_mov_b32_e32 v18, s22 +; SI-NEXT: v_readfirstlane_b32 s43, v19 +; SI-NEXT: v_mov_b32_e32 v19, s23 +; SI-NEXT: v_readfirstlane_b32 s23, v16 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_readfirstlane_b32 s24, v17 +; SI-NEXT: v_mov_b32_e32 v17, s25 +; SI-NEXT: v_readfirstlane_b32 s25, v18 +; SI-NEXT: v_mov_b32_e32 v18, s26 +; SI-NEXT: v_readfirstlane_b32 s26, v19 +; SI-NEXT: v_mov_b32_e32 v19, s27 +; SI-NEXT: v_readfirstlane_b32 s27, v16 +; SI-NEXT: v_mov_b32_e32 v16, s28 +; SI-NEXT: v_readfirstlane_b32 s28, v17 +; SI-NEXT: v_mov_b32_e32 v17, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; SI-NEXT: v_readfirstlane_b32 s43, v1 -; SI-NEXT: v_readfirstlane_b32 s42, v2 -; SI-NEXT: v_readfirstlane_b32 s41, v3 -; SI-NEXT: v_readfirstlane_b32 s40, v4 +; SI-NEXT: v_readfirstlane_b32 s29, v18 +; SI-NEXT: v_readfirstlane_b32 s22, v19 +; SI-NEXT: v_readfirstlane_b32 s21, v16 +; SI-NEXT: v_readfirstlane_b32 s20, v17 +; SI-NEXT: v_readfirstlane_b32 s19, v1 +; SI-NEXT: v_readfirstlane_b32 s18, v2 +; SI-NEXT: v_readfirstlane_b32 s17, v3 +; SI-NEXT: v_readfirstlane_b32 s16, v4 ; SI-NEXT: v_readfirstlane_b32 s15, v5 ; SI-NEXT: v_readfirstlane_b32 s14, v6 ; SI-NEXT: v_readfirstlane_b32 s13, v7 @@ -8505,44 +8644,44 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 ; SI-NEXT: s_lshr_b32 s4, s15, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s40, 16 +; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s41, 16 +; SI-NEXT: s_lshr_b32 s4, s17, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_lshr_b32 s4, s42, 16 +; SI-NEXT: s_lshr_b32 s4, s18, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 -; SI-NEXT: s_lshr_b32 s4, s43, 16 +; SI-NEXT: s_lshr_b32 s4, s19, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 -; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s4, s20, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: s_lshr_b32 s4, s21, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: s_lshr_b32 s4, s22, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: s_lshr_b32 s4, s29, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: s_lshr_b32 s4, s28, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: s_lshr_b32 s4, s27, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: s_lshr_b32 s4, s26, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: s_lshr_b32 s4, s25, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: s_lshr_b32 s4, s24, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: s_lshr_b32 s4, s23, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: s_lshr_b32 s4, s43, 16 ; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: s_lshr_b32 s4, s42, 16 ; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: s_lshr_b32 s4, s41, 16 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_lshr_b32 s4, s40, 16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 @@ -8555,33 +8694,30 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s40 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s43, s43, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 @@ -8589,10 +8725,13 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i ; SI-NEXT: s_add_i32 s27, s27, 3 ; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s43, s43, 3 -; SI-NEXT: s_add_i32 s42, s42, 3 -; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s15, s15, 3 ; SI-NEXT: s_add_i32 s14, s14, 3 ; SI-NEXT: s_add_i32 s13, s13, 3 @@ -8603,24 +8742,24 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i ; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: s_lshr_b32 s5, s17, 16 -; SI-NEXT: s_lshr_b32 s44, s18, 16 -; SI-NEXT: s_lshr_b32 s45, s19, 16 -; SI-NEXT: s_lshr_b32 s46, s20, 16 -; SI-NEXT: s_lshr_b32 s47, s21, 16 -; SI-NEXT: s_lshr_b32 s56, s22, 16 -; SI-NEXT: s_lshr_b32 s57, s23, 16 -; SI-NEXT: s_lshr_b32 s58, s24, 16 -; SI-NEXT: s_lshr_b32 s59, s25, 16 -; SI-NEXT: s_lshr_b32 s60, s26, 16 -; SI-NEXT: s_lshr_b32 s61, s27, 16 -; SI-NEXT: s_lshr_b32 s62, s28, 16 -; SI-NEXT: s_lshr_b32 s63, s29, 16 -; SI-NEXT: s_lshr_b32 s72, s43, 16 -; SI-NEXT: s_lshr_b32 s73, s42, 16 -; SI-NEXT: s_lshr_b32 s74, s41, 16 -; SI-NEXT: s_lshr_b32 s75, s40, 16 +; SI-NEXT: s_lshr_b32 s4, s40, 16 +; SI-NEXT: s_lshr_b32 s5, s41, 16 +; SI-NEXT: s_lshr_b32 s44, s42, 16 +; SI-NEXT: s_lshr_b32 s45, s43, 16 +; SI-NEXT: s_lshr_b32 s46, s23, 16 +; SI-NEXT: s_lshr_b32 s47, s24, 16 +; SI-NEXT: s_lshr_b32 s56, s25, 16 +; SI-NEXT: s_lshr_b32 s57, s26, 16 +; SI-NEXT: s_lshr_b32 s58, s27, 16 +; SI-NEXT: s_lshr_b32 s59, s28, 16 +; SI-NEXT: s_lshr_b32 s60, s29, 16 +; SI-NEXT: s_lshr_b32 s61, s22, 16 +; SI-NEXT: s_lshr_b32 s62, s21, 16 +; SI-NEXT: s_lshr_b32 s63, s20, 16 +; SI-NEXT: s_lshr_b32 s72, s19, 16 +; SI-NEXT: s_lshr_b32 s73, s18, 16 +; SI-NEXT: s_lshr_b32 s74, s17, 16 +; SI-NEXT: s_lshr_b32 s75, s16, 16 ; SI-NEXT: s_lshr_b32 s76, s15, 16 ; SI-NEXT: s_lshr_b32 s77, s14, 16 ; SI-NEXT: s_lshr_b32 s78, s13, 16 @@ -8641,27 +8780,27 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s43 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v43, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s42 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v45, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s41 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v47, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s40 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s93 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s92 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s91 @@ -8963,14 +9102,42 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v15, s16 +; VI-NEXT: v_mov_b32_e32 v16, s17 +; VI-NEXT: v_mov_b32_e32 v17, s18 +; VI-NEXT: v_mov_b32_e32 v18, s19 ; VI-NEXT: v_writelane_b32 v28, s30, 0 +; VI-NEXT: v_mov_b32_e32 v19, s20 +; VI-NEXT: v_readfirstlane_b32 s46, v15 +; VI-NEXT: v_mov_b32_e32 v15, s21 +; VI-NEXT: v_readfirstlane_b32 s45, v16 +; VI-NEXT: v_mov_b32_e32 v16, s22 +; VI-NEXT: v_readfirstlane_b32 s44, v17 +; VI-NEXT: v_mov_b32_e32 v17, s23 +; VI-NEXT: v_readfirstlane_b32 s43, v18 +; VI-NEXT: v_mov_b32_e32 v18, s24 ; VI-NEXT: v_writelane_b32 v28, s31, 1 +; VI-NEXT: v_readfirstlane_b32 s42, v19 +; VI-NEXT: v_mov_b32_e32 v19, s25 +; VI-NEXT: v_readfirstlane_b32 s41, v15 +; VI-NEXT: v_mov_b32_e32 v15, s26 +; VI-NEXT: v_readfirstlane_b32 s40, v16 +; VI-NEXT: v_mov_b32_e32 v16, s27 +; VI-NEXT: v_readfirstlane_b32 s26, v17 +; VI-NEXT: v_mov_b32_e32 v17, s28 +; VI-NEXT: v_readfirstlane_b32 s25, v18 +; VI-NEXT: v_mov_b32_e32 v18, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 ; VI-NEXT: v_writelane_b32 v28, s34, 2 -; VI-NEXT: v_readfirstlane_b32 s43, v0 -; VI-NEXT: v_readfirstlane_b32 s42, v1 -; VI-NEXT: v_readfirstlane_b32 s41, v2 -; VI-NEXT: v_readfirstlane_b32 s40, v3 +; VI-NEXT: v_readfirstlane_b32 s24, v19 +; VI-NEXT: v_readfirstlane_b32 s23, v15 +; VI-NEXT: v_readfirstlane_b32 s22, v16 +; VI-NEXT: v_readfirstlane_b32 s21, v17 +; VI-NEXT: v_readfirstlane_b32 s20, v18 +; VI-NEXT: v_readfirstlane_b32 s19, v0 +; VI-NEXT: v_readfirstlane_b32 s18, v1 +; VI-NEXT: v_readfirstlane_b32 s17, v2 +; VI-NEXT: v_readfirstlane_b32 s16, v3 ; VI-NEXT: v_readfirstlane_b32 s15, v4 ; VI-NEXT: v_readfirstlane_b32 s14, v5 ; VI-NEXT: v_readfirstlane_b32 s13, v6 @@ -8985,9 +9152,9 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i ; VI-NEXT: v_writelane_b32 v28, s35, 3 ; VI-NEXT: s_cbranch_scc0 .LBB17_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s44, s7, 16 -; VI-NEXT: s_lshr_b32 s45, s6, 16 -; VI-NEXT: s_lshr_b32 s46, s8, 16 +; VI-NEXT: s_lshr_b32 s27, s7, 16 +; VI-NEXT: s_lshr_b32 s28, s6, 16 +; VI-NEXT: s_lshr_b32 s29, s8, 16 ; VI-NEXT: s_lshr_b32 s47, s9, 16 ; VI-NEXT: s_lshr_b32 s56, s10, 16 ; VI-NEXT: s_lshr_b32 s57, s11, 16 @@ -8995,24 +9162,24 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i ; VI-NEXT: s_lshr_b32 s59, s13, 16 ; VI-NEXT: s_lshr_b32 s60, s14, 16 ; VI-NEXT: s_lshr_b32 s61, s15, 16 -; VI-NEXT: s_lshr_b32 s62, s40, 16 -; VI-NEXT: s_lshr_b32 s63, s41, 16 -; VI-NEXT: s_lshr_b32 s72, s42, 16 -; VI-NEXT: s_lshr_b32 s73, s43, 16 -; VI-NEXT: s_lshr_b32 s74, s29, 16 -; VI-NEXT: s_lshr_b32 s75, s28, 16 -; VI-NEXT: s_lshr_b32 s76, s27, 16 -; VI-NEXT: s_lshr_b32 s77, s26, 16 -; VI-NEXT: s_lshr_b32 s78, s25, 16 -; VI-NEXT: s_lshr_b32 s79, s24, 16 -; VI-NEXT: s_lshr_b32 s88, s23, 16 -; VI-NEXT: s_lshr_b32 s89, s22, 16 -; VI-NEXT: s_lshr_b32 s90, s21, 16 -; VI-NEXT: s_lshr_b32 s91, s20, 16 -; VI-NEXT: s_lshr_b32 s30, s19, 16 -; VI-NEXT: s_lshr_b32 s31, s18, 16 -; VI-NEXT: s_lshr_b32 s34, s17, 16 -; VI-NEXT: s_lshr_b32 s35, s16, 16 +; VI-NEXT: s_lshr_b32 s62, s16, 16 +; VI-NEXT: s_lshr_b32 s63, s17, 16 +; VI-NEXT: s_lshr_b32 s72, s18, 16 +; VI-NEXT: s_lshr_b32 s73, s19, 16 +; VI-NEXT: s_lshr_b32 s74, s20, 16 +; VI-NEXT: s_lshr_b32 s75, s21, 16 +; VI-NEXT: s_lshr_b32 s76, s22, 16 +; VI-NEXT: s_lshr_b32 s77, s23, 16 +; VI-NEXT: s_lshr_b32 s78, s24, 16 +; VI-NEXT: s_lshr_b32 s79, s25, 16 +; VI-NEXT: s_lshr_b32 s88, s26, 16 +; VI-NEXT: s_lshr_b32 s89, s40, 16 +; VI-NEXT: s_lshr_b32 s90, s41, 16 +; VI-NEXT: s_lshr_b32 s91, s42, 16 +; VI-NEXT: s_lshr_b32 s30, s43, 16 +; VI-NEXT: s_lshr_b32 s31, s44, 16 +; VI-NEXT: s_lshr_b32 s34, s45, 16 +; VI-NEXT: s_lshr_b32 s35, s46, 16 ; VI-NEXT: s_cbranch_execnz .LBB17_3 ; VI-NEXT: .LBB17_2: ; %cmp.true ; VI-NEXT: s_add_i32 s7, s7, 3 @@ -9025,27 +9192,27 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i ; VI-NEXT: s_add_i32 s13, s13, 3 ; VI-NEXT: s_add_i32 s14, s14, 3 ; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 ; VI-NEXT: s_add_i32 s40, s40, 3 ; VI-NEXT: s_add_i32 s41, s41, 3 ; VI-NEXT: s_add_i32 s42, s42, 3 ; VI-NEXT: s_add_i32 s43, s43, 3 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_lshr_b32 s44, s7, 16 -; VI-NEXT: s_lshr_b32 s45, s6, 16 -; VI-NEXT: s_lshr_b32 s46, s8, 16 +; VI-NEXT: s_add_i32 s44, s44, 3 +; VI-NEXT: s_add_i32 s45, s45, 3 +; VI-NEXT: s_add_i32 s46, s46, 3 +; VI-NEXT: s_lshr_b32 s27, s7, 16 +; VI-NEXT: s_lshr_b32 s28, s6, 16 +; VI-NEXT: s_lshr_b32 s29, s8, 16 ; VI-NEXT: s_lshr_b32 s47, s9, 16 ; VI-NEXT: s_lshr_b32 s56, s10, 16 ; VI-NEXT: s_lshr_b32 s57, s11, 16 @@ -9053,127 +9220,127 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i ; VI-NEXT: s_lshr_b32 s59, s13, 16 ; VI-NEXT: s_lshr_b32 s60, s14, 16 ; VI-NEXT: s_lshr_b32 s61, s15, 16 -; VI-NEXT: s_lshr_b32 s62, s40, 16 -; VI-NEXT: s_lshr_b32 s63, s41, 16 -; VI-NEXT: s_lshr_b32 s72, s42, 16 -; VI-NEXT: s_lshr_b32 s73, s43, 16 -; VI-NEXT: s_lshr_b32 s74, s29, 16 -; VI-NEXT: s_lshr_b32 s75, s28, 16 -; VI-NEXT: s_lshr_b32 s76, s27, 16 -; VI-NEXT: s_lshr_b32 s77, s26, 16 -; VI-NEXT: s_lshr_b32 s78, s25, 16 -; VI-NEXT: s_lshr_b32 s79, s24, 16 -; VI-NEXT: s_lshr_b32 s88, s23, 16 -; VI-NEXT: s_lshr_b32 s89, s22, 16 -; VI-NEXT: s_lshr_b32 s90, s21, 16 -; VI-NEXT: s_lshr_b32 s91, s20, 16 -; VI-NEXT: s_lshr_b32 s30, s19, 16 -; VI-NEXT: s_lshr_b32 s31, s18, 16 -; VI-NEXT: s_lshr_b32 s34, s17, 16 -; VI-NEXT: s_lshr_b32 s35, s16, 16 +; VI-NEXT: s_lshr_b32 s62, s16, 16 +; VI-NEXT: s_lshr_b32 s63, s17, 16 +; VI-NEXT: s_lshr_b32 s72, s18, 16 +; VI-NEXT: s_lshr_b32 s73, s19, 16 +; VI-NEXT: s_lshr_b32 s74, s20, 16 +; VI-NEXT: s_lshr_b32 s75, s21, 16 +; VI-NEXT: s_lshr_b32 s76, s22, 16 +; VI-NEXT: s_lshr_b32 s77, s23, 16 +; VI-NEXT: s_lshr_b32 s78, s24, 16 +; VI-NEXT: s_lshr_b32 s79, s25, 16 +; VI-NEXT: s_lshr_b32 s88, s26, 16 +; VI-NEXT: s_lshr_b32 s89, s40, 16 +; VI-NEXT: s_lshr_b32 s90, s41, 16 +; VI-NEXT: s_lshr_b32 s91, s42, 16 +; VI-NEXT: s_lshr_b32 s30, s43, 16 +; VI-NEXT: s_lshr_b32 s31, s44, 16 +; VI-NEXT: s_lshr_b32 s34, s45, 16 +; VI-NEXT: s_lshr_b32 s35, s46, 16 ; VI-NEXT: .LBB17_3: ; %end -; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_and_b32 s4, 0xffff, s46 ; VI-NEXT: s_lshl_b32 s5, s35, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s16, s34, 16 -; VI-NEXT: s_or_b32 s5, s5, s16 -; VI-NEXT: s_and_b32 s16, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s17, s31, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_and_b32 s17, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s18, s30, 16 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s18, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s19, s91, 16 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_and_b32 s19, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s20, s90, 16 -; VI-NEXT: s_or_b32 s19, s19, s20 -; VI-NEXT: s_and_b32 s20, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s21, s89, 16 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_and_b32 s21, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s22, s88, 16 -; VI-NEXT: s_or_b32 s21, s21, s22 -; VI-NEXT: s_and_b32 s22, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s23, s79, 16 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_and_b32 s23, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s24, s78, 16 -; VI-NEXT: s_or_b32 s23, s23, s24 -; VI-NEXT: s_and_b32 s24, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s25, s77, 16 -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_and_b32 s25, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s26, s76, 16 -; VI-NEXT: s_or_b32 s25, s25, s26 -; VI-NEXT: s_and_b32 s26, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s27, s75, 16 -; VI-NEXT: s_or_b32 s26, s26, s27 -; VI-NEXT: s_and_b32 s27, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s28, s74, 16 -; VI-NEXT: s_or_b32 s27, s27, s28 -; VI-NEXT: s_and_b32 s28, 0xffff, s43 -; VI-NEXT: s_lshl_b32 s29, s73, 16 -; VI-NEXT: s_or_b32 s28, s28, s29 -; VI-NEXT: s_and_b32 s29, 0xffff, s42 -; VI-NEXT: s_lshl_b32 s42, s72, 16 -; VI-NEXT: s_or_b32 s29, s29, s42 +; VI-NEXT: s_and_b32 s5, 0xffff, s45 +; VI-NEXT: s_lshl_b32 s45, s34, 16 +; VI-NEXT: s_or_b32 s5, s5, s45 +; VI-NEXT: s_and_b32 s44, 0xffff, s44 +; VI-NEXT: s_lshl_b32 s45, s31, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s43, 0xffff, s43 +; VI-NEXT: s_lshl_b32 s45, s30, 16 +; VI-NEXT: s_or_b32 s43, s43, s45 +; VI-NEXT: s_and_b32 s42, 0xffff, s42 +; VI-NEXT: s_lshl_b32 s45, s91, 16 +; VI-NEXT: s_or_b32 s42, s42, s45 ; VI-NEXT: s_and_b32 s41, 0xffff, s41 -; VI-NEXT: s_lshl_b32 s42, s63, 16 -; VI-NEXT: s_or_b32 s41, s41, s42 +; VI-NEXT: s_lshl_b32 s45, s90, 16 +; VI-NEXT: s_or_b32 s41, s41, s45 ; VI-NEXT: s_and_b32 s40, 0xffff, s40 -; VI-NEXT: s_lshl_b32 s42, s62, 16 -; VI-NEXT: s_or_b32 s40, s40, s42 +; VI-NEXT: s_lshl_b32 s45, s89, 16 +; VI-NEXT: s_or_b32 s40, s40, s45 +; VI-NEXT: s_and_b32 s26, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s45, s88, 16 +; VI-NEXT: s_or_b32 s26, s26, s45 +; VI-NEXT: s_and_b32 s25, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s45, s79, 16 +; VI-NEXT: s_or_b32 s25, s25, s45 +; VI-NEXT: s_and_b32 s24, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s45, s78, 16 +; VI-NEXT: s_or_b32 s24, s24, s45 +; VI-NEXT: s_and_b32 s23, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s45, s77, 16 +; VI-NEXT: s_or_b32 s23, s23, s45 +; VI-NEXT: s_and_b32 s22, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s45, s76, 16 +; VI-NEXT: s_or_b32 s22, s22, s45 +; VI-NEXT: s_and_b32 s21, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s45, s75, 16 +; VI-NEXT: s_or_b32 s21, s21, s45 +; VI-NEXT: s_and_b32 s20, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s45, s74, 16 +; VI-NEXT: s_or_b32 s20, s20, s45 +; VI-NEXT: s_and_b32 s19, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s45, s73, 16 +; VI-NEXT: s_or_b32 s19, s19, s45 +; VI-NEXT: s_and_b32 s18, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s72, 16 +; VI-NEXT: s_or_b32 s18, s18, s45 +; VI-NEXT: s_and_b32 s17, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s45, s63, 16 +; VI-NEXT: s_or_b32 s17, s17, s45 +; VI-NEXT: s_and_b32 s16, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s45, s62, 16 +; VI-NEXT: s_or_b32 s16, s16, s45 ; VI-NEXT: s_and_b32 s15, 0xffff, s15 -; VI-NEXT: s_lshl_b32 s42, s61, 16 -; VI-NEXT: s_or_b32 s15, s15, s42 +; VI-NEXT: s_lshl_b32 s45, s61, 16 +; VI-NEXT: s_or_b32 s15, s15, s45 ; VI-NEXT: s_and_b32 s14, 0xffff, s14 -; VI-NEXT: s_lshl_b32 s42, s60, 16 -; VI-NEXT: s_or_b32 s14, s14, s42 +; VI-NEXT: s_lshl_b32 s45, s60, 16 +; VI-NEXT: s_or_b32 s14, s14, s45 ; VI-NEXT: s_and_b32 s13, 0xffff, s13 -; VI-NEXT: s_lshl_b32 s42, s59, 16 -; VI-NEXT: s_or_b32 s13, s13, s42 +; VI-NEXT: s_lshl_b32 s45, s59, 16 +; VI-NEXT: s_or_b32 s13, s13, s45 ; VI-NEXT: s_and_b32 s12, 0xffff, s12 -; VI-NEXT: s_lshl_b32 s42, s58, 16 -; VI-NEXT: s_or_b32 s12, s12, s42 +; VI-NEXT: s_lshl_b32 s45, s58, 16 +; VI-NEXT: s_or_b32 s12, s12, s45 ; VI-NEXT: s_and_b32 s11, 0xffff, s11 -; VI-NEXT: s_lshl_b32 s42, s57, 16 -; VI-NEXT: s_or_b32 s11, s11, s42 +; VI-NEXT: s_lshl_b32 s45, s57, 16 +; VI-NEXT: s_or_b32 s11, s11, s45 ; VI-NEXT: s_and_b32 s10, 0xffff, s10 -; VI-NEXT: s_lshl_b32 s42, s56, 16 -; VI-NEXT: s_or_b32 s10, s10, s42 +; VI-NEXT: s_lshl_b32 s45, s56, 16 +; VI-NEXT: s_or_b32 s10, s10, s45 ; VI-NEXT: s_and_b32 s9, 0xffff, s9 -; VI-NEXT: s_lshl_b32 s42, s47, 16 -; VI-NEXT: s_or_b32 s9, s9, s42 +; VI-NEXT: s_lshl_b32 s45, s47, 16 ; VI-NEXT: s_and_b32 s8, 0xffff, s8 -; VI-NEXT: s_lshl_b32 s42, s46, 16 -; VI-NEXT: s_or_b32 s8, s8, s42 +; VI-NEXT: s_lshl_b32 s29, s29, 16 ; VI-NEXT: s_and_b32 s6, 0xffff, s6 -; VI-NEXT: s_lshl_b32 s42, s45, 16 -; VI-NEXT: s_or_b32 s6, s6, s42 +; VI-NEXT: s_lshl_b32 s28, s28, 16 ; VI-NEXT: s_and_b32 s7, 0xffff, s7 -; VI-NEXT: s_lshl_b32 s42, s44, 16 -; VI-NEXT: s_or_b32 s7, s7, s42 +; VI-NEXT: s_lshl_b32 s27, s27, 16 +; VI-NEXT: s_or_b32 s9, s9, s45 +; VI-NEXT: s_or_b32 s8, s8, s29 +; VI-NEXT: s_or_b32 s6, s6, s28 +; VI-NEXT: s_or_b32 s7, s7, s27 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: v_mov_b32_e32 v3, s17 -; VI-NEXT: v_mov_b32_e32 v4, s18 -; VI-NEXT: v_mov_b32_e32 v5, s19 -; VI-NEXT: v_mov_b32_e32 v6, s20 -; VI-NEXT: v_mov_b32_e32 v7, s21 -; VI-NEXT: v_mov_b32_e32 v8, s22 -; VI-NEXT: v_mov_b32_e32 v9, s23 -; VI-NEXT: v_mov_b32_e32 v10, s24 -; VI-NEXT: v_mov_b32_e32 v11, s25 -; VI-NEXT: v_mov_b32_e32 v12, s26 -; VI-NEXT: v_mov_b32_e32 v13, s27 -; VI-NEXT: v_mov_b32_e32 v14, s28 -; VI-NEXT: v_mov_b32_e32 v15, s29 -; VI-NEXT: v_mov_b32_e32 v16, s41 -; VI-NEXT: v_mov_b32_e32 v17, s40 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s43 +; VI-NEXT: v_mov_b32_e32 v4, s42 +; VI-NEXT: v_mov_b32_e32 v5, s41 +; VI-NEXT: v_mov_b32_e32 v6, s40 +; VI-NEXT: v_mov_b32_e32 v7, s26 +; VI-NEXT: v_mov_b32_e32 v8, s25 +; VI-NEXT: v_mov_b32_e32 v9, s24 +; VI-NEXT: v_mov_b32_e32 v10, s23 +; VI-NEXT: v_mov_b32_e32 v11, s22 +; VI-NEXT: v_mov_b32_e32 v12, s21 +; VI-NEXT: v_mov_b32_e32 v13, s20 +; VI-NEXT: v_mov_b32_e32 v14, s19 +; VI-NEXT: v_mov_b32_e32 v15, s18 +; VI-NEXT: v_mov_b32_e32 v16, s17 +; VI-NEXT: v_mov_b32_e32 v17, s16 ; VI-NEXT: v_mov_b32_e32 v18, s15 ; VI-NEXT: v_mov_b32_e32 v19, s14 ; VI-NEXT: v_mov_b32_e32 v20, s13 @@ -9219,25 +9386,53 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i ; VI-NEXT: ; implicit-def: $sgpr57 ; VI-NEXT: ; implicit-def: $sgpr56 ; VI-NEXT: ; implicit-def: $sgpr47 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr45 -; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 ; VI-NEXT: s_branch .LBB17_2 ; ; GFX9-LABEL: bitcast_v28i32_to_v56f16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v15, s16 +; GFX9-NEXT: v_mov_b32_e32 v16, s17 +; GFX9-NEXT: v_mov_b32_e32 v17, s18 +; GFX9-NEXT: v_mov_b32_e32 v18, s19 +; GFX9-NEXT: v_mov_b32_e32 v19, s20 +; GFX9-NEXT: v_readfirstlane_b32 s6, v15 +; GFX9-NEXT: v_mov_b32_e32 v15, s21 +; GFX9-NEXT: v_readfirstlane_b32 s7, v16 +; GFX9-NEXT: v_mov_b32_e32 v16, s22 +; GFX9-NEXT: v_readfirstlane_b32 s8, v17 +; GFX9-NEXT: v_mov_b32_e32 v17, s23 +; GFX9-NEXT: v_readfirstlane_b32 s9, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, s24 +; GFX9-NEXT: v_readfirstlane_b32 s10, v19 +; GFX9-NEXT: v_mov_b32_e32 v19, s25 +; GFX9-NEXT: v_readfirstlane_b32 s11, v15 +; GFX9-NEXT: v_mov_b32_e32 v15, s26 +; GFX9-NEXT: v_readfirstlane_b32 s12, v16 +; GFX9-NEXT: v_mov_b32_e32 v16, s27 +; GFX9-NEXT: v_readfirstlane_b32 s13, v17 +; GFX9-NEXT: v_mov_b32_e32 v17, s28 +; GFX9-NEXT: v_readfirstlane_b32 s14, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: v_readfirstlane_b32 s7, v1 -; GFX9-NEXT: v_readfirstlane_b32 s8, v2 -; GFX9-NEXT: v_readfirstlane_b32 s9, v3 -; GFX9-NEXT: v_readfirstlane_b32 s10, v4 -; GFX9-NEXT: v_readfirstlane_b32 s11, v5 -; GFX9-NEXT: v_readfirstlane_b32 s12, v6 -; GFX9-NEXT: v_readfirstlane_b32 s13, v7 -; GFX9-NEXT: v_readfirstlane_b32 s14, v8 -; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s15, v19 +; GFX9-NEXT: v_readfirstlane_b32 s16, v15 +; GFX9-NEXT: v_readfirstlane_b32 s17, v16 +; GFX9-NEXT: v_readfirstlane_b32 s18, v17 +; GFX9-NEXT: v_readfirstlane_b32 s19, v18 +; GFX9-NEXT: v_readfirstlane_b32 s20, v0 +; GFX9-NEXT: v_readfirstlane_b32 s21, v1 +; GFX9-NEXT: v_readfirstlane_b32 s22, v2 +; GFX9-NEXT: v_readfirstlane_b32 s23, v3 +; GFX9-NEXT: v_readfirstlane_b32 s24, v4 +; GFX9-NEXT: v_readfirstlane_b32 s25, v5 +; GFX9-NEXT: v_readfirstlane_b32 s26, v6 +; GFX9-NEXT: v_readfirstlane_b32 s27, v7 +; GFX9-NEXT: v_readfirstlane_b32 s28, v8 +; GFX9-NEXT: v_readfirstlane_b32 s29, v9 ; GFX9-NEXT: v_readfirstlane_b32 s40, v10 ; GFX9-NEXT: v_readfirstlane_b32 s41, v11 ; GFX9-NEXT: v_readfirstlane_b32 s42, v12 @@ -9249,46 +9444,36 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i ; GFX9-NEXT: s_lshr_b32 s45, s42, 16 ; GFX9-NEXT: s_lshr_b32 s46, s41, 16 ; GFX9-NEXT: s_lshr_b32 s47, s40, 16 -; GFX9-NEXT: s_lshr_b32 s56, s15, 16 -; GFX9-NEXT: s_lshr_b32 s57, s14, 16 -; GFX9-NEXT: s_lshr_b32 s58, s13, 16 -; GFX9-NEXT: s_lshr_b32 s59, s12, 16 -; GFX9-NEXT: s_lshr_b32 s60, s11, 16 -; GFX9-NEXT: s_lshr_b32 s61, s10, 16 -; GFX9-NEXT: s_lshr_b32 s62, s9, 16 -; GFX9-NEXT: s_lshr_b32 s63, s8, 16 -; GFX9-NEXT: s_lshr_b32 s72, s7, 16 -; GFX9-NEXT: s_lshr_b32 s73, s6, 16 -; GFX9-NEXT: s_lshr_b32 s74, s29, 16 -; GFX9-NEXT: s_lshr_b32 s75, s28, 16 -; GFX9-NEXT: s_lshr_b32 s76, s27, 16 -; GFX9-NEXT: s_lshr_b32 s77, s26, 16 -; GFX9-NEXT: s_lshr_b32 s78, s25, 16 -; GFX9-NEXT: s_lshr_b32 s79, s24, 16 -; GFX9-NEXT: s_lshr_b32 s88, s23, 16 -; GFX9-NEXT: s_lshr_b32 s89, s22, 16 -; GFX9-NEXT: s_lshr_b32 s90, s21, 16 -; GFX9-NEXT: s_lshr_b32 s91, s20, 16 -; GFX9-NEXT: s_lshr_b32 s92, s19, 16 -; GFX9-NEXT: s_lshr_b32 s93, s18, 16 -; GFX9-NEXT: s_lshr_b32 s94, s17, 16 -; GFX9-NEXT: s_lshr_b32 s95, s16, 16 +; GFX9-NEXT: s_lshr_b32 s56, s29, 16 +; GFX9-NEXT: s_lshr_b32 s57, s28, 16 +; GFX9-NEXT: s_lshr_b32 s58, s27, 16 +; GFX9-NEXT: s_lshr_b32 s59, s26, 16 +; GFX9-NEXT: s_lshr_b32 s60, s25, 16 +; GFX9-NEXT: s_lshr_b32 s61, s24, 16 +; GFX9-NEXT: s_lshr_b32 s62, s23, 16 +; GFX9-NEXT: s_lshr_b32 s63, s22, 16 +; GFX9-NEXT: s_lshr_b32 s72, s21, 16 +; GFX9-NEXT: s_lshr_b32 s73, s20, 16 +; GFX9-NEXT: s_lshr_b32 s74, s19, 16 +; GFX9-NEXT: s_lshr_b32 s75, s18, 16 +; GFX9-NEXT: s_lshr_b32 s76, s17, 16 +; GFX9-NEXT: s_lshr_b32 s77, s16, 16 +; GFX9-NEXT: s_lshr_b32 s78, s15, 16 +; GFX9-NEXT: s_lshr_b32 s79, s14, 16 +; GFX9-NEXT: s_lshr_b32 s88, s13, 16 +; GFX9-NEXT: s_lshr_b32 s89, s12, 16 +; GFX9-NEXT: s_lshr_b32 s90, s11, 16 +; GFX9-NEXT: s_lshr_b32 s91, s10, 16 +; GFX9-NEXT: s_lshr_b32 s92, s9, 16 +; GFX9-NEXT: s_lshr_b32 s93, s8, 16 +; GFX9-NEXT: s_lshr_b32 s94, s7, 16 +; GFX9-NEXT: s_lshr_b32 s95, s6, 16 ; GFX9-NEXT: s_cbranch_execnz .LBB17_3 ; GFX9-NEXT: .LBB17_2: ; %cmp.true ; GFX9-NEXT: s_add_i32 s43, s43, 3 ; GFX9-NEXT: s_add_i32 s42, s42, 3 ; GFX9-NEXT: s_add_i32 s41, s41, 3 ; GFX9-NEXT: s_add_i32 s40, s40, 3 -; GFX9-NEXT: s_add_i32 s15, s15, 3 -; GFX9-NEXT: s_add_i32 s14, s14, 3 -; GFX9-NEXT: s_add_i32 s13, s13, 3 -; GFX9-NEXT: s_add_i32 s12, s12, 3 -; GFX9-NEXT: s_add_i32 s11, s11, 3 -; GFX9-NEXT: s_add_i32 s10, s10, 3 -; GFX9-NEXT: s_add_i32 s9, s9, 3 -; GFX9-NEXT: s_add_i32 s8, s8, 3 -; GFX9-NEXT: s_add_i32 s7, s7, 3 -; GFX9-NEXT: s_add_i32 s6, s6, 3 ; GFX9-NEXT: s_add_i32 s29, s29, 3 ; GFX9-NEXT: s_add_i32 s28, s28, 3 ; GFX9-NEXT: s_add_i32 s27, s27, 3 @@ -9303,87 +9488,97 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i ; GFX9-NEXT: s_add_i32 s18, s18, 3 ; GFX9-NEXT: s_add_i32 s17, s17, 3 ; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s15, s15, 3 +; GFX9-NEXT: s_add_i32 s14, s14, 3 +; GFX9-NEXT: s_add_i32 s13, s13, 3 +; GFX9-NEXT: s_add_i32 s12, s12, 3 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_add_i32 s10, s10, 3 +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: s_add_i32 s6, s6, 3 ; GFX9-NEXT: s_lshr_b32 s44, s43, 16 ; GFX9-NEXT: s_lshr_b32 s45, s42, 16 ; GFX9-NEXT: s_lshr_b32 s46, s41, 16 ; GFX9-NEXT: s_lshr_b32 s47, s40, 16 -; GFX9-NEXT: s_lshr_b32 s56, s15, 16 -; GFX9-NEXT: s_lshr_b32 s57, s14, 16 -; GFX9-NEXT: s_lshr_b32 s58, s13, 16 -; GFX9-NEXT: s_lshr_b32 s59, s12, 16 -; GFX9-NEXT: s_lshr_b32 s60, s11, 16 -; GFX9-NEXT: s_lshr_b32 s61, s10, 16 -; GFX9-NEXT: s_lshr_b32 s62, s9, 16 -; GFX9-NEXT: s_lshr_b32 s63, s8, 16 -; GFX9-NEXT: s_lshr_b32 s72, s7, 16 -; GFX9-NEXT: s_lshr_b32 s73, s6, 16 -; GFX9-NEXT: s_lshr_b32 s74, s29, 16 -; GFX9-NEXT: s_lshr_b32 s75, s28, 16 -; GFX9-NEXT: s_lshr_b32 s76, s27, 16 -; GFX9-NEXT: s_lshr_b32 s77, s26, 16 -; GFX9-NEXT: s_lshr_b32 s78, s25, 16 -; GFX9-NEXT: s_lshr_b32 s79, s24, 16 -; GFX9-NEXT: s_lshr_b32 s88, s23, 16 -; GFX9-NEXT: s_lshr_b32 s89, s22, 16 -; GFX9-NEXT: s_lshr_b32 s90, s21, 16 -; GFX9-NEXT: s_lshr_b32 s91, s20, 16 -; GFX9-NEXT: s_lshr_b32 s92, s19, 16 -; GFX9-NEXT: s_lshr_b32 s93, s18, 16 -; GFX9-NEXT: s_lshr_b32 s94, s17, 16 -; GFX9-NEXT: s_lshr_b32 s95, s16, 16 +; GFX9-NEXT: s_lshr_b32 s56, s29, 16 +; GFX9-NEXT: s_lshr_b32 s57, s28, 16 +; GFX9-NEXT: s_lshr_b32 s58, s27, 16 +; GFX9-NEXT: s_lshr_b32 s59, s26, 16 +; GFX9-NEXT: s_lshr_b32 s60, s25, 16 +; GFX9-NEXT: s_lshr_b32 s61, s24, 16 +; GFX9-NEXT: s_lshr_b32 s62, s23, 16 +; GFX9-NEXT: s_lshr_b32 s63, s22, 16 +; GFX9-NEXT: s_lshr_b32 s72, s21, 16 +; GFX9-NEXT: s_lshr_b32 s73, s20, 16 +; GFX9-NEXT: s_lshr_b32 s74, s19, 16 +; GFX9-NEXT: s_lshr_b32 s75, s18, 16 +; GFX9-NEXT: s_lshr_b32 s76, s17, 16 +; GFX9-NEXT: s_lshr_b32 s77, s16, 16 +; GFX9-NEXT: s_lshr_b32 s78, s15, 16 +; GFX9-NEXT: s_lshr_b32 s79, s14, 16 +; GFX9-NEXT: s_lshr_b32 s88, s13, 16 +; GFX9-NEXT: s_lshr_b32 s89, s12, 16 +; GFX9-NEXT: s_lshr_b32 s90, s11, 16 +; GFX9-NEXT: s_lshr_b32 s91, s10, 16 +; GFX9-NEXT: s_lshr_b32 s92, s9, 16 +; GFX9-NEXT: s_lshr_b32 s93, s8, 16 +; GFX9-NEXT: s_lshr_b32 s94, s7, 16 +; GFX9-NEXT: s_lshr_b32 s95, s6, 16 ; GFX9-NEXT: .LBB17_3: ; %end -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s95 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s94 -; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s93 -; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s92 -; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s91 -; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s90 -; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s89 -; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s88 -; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s79 -; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s78 -; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s77 -; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s76 -; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s75 -; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s74 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s73 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s72 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s63 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s62 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s61 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s60 -; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s59 -; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s58 -; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s57 -; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s95 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s7, s94 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s93 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s9, s92 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s10, s91 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s11, s90 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s12, s89 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s13, s88 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s14, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s15, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s16, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s17, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s56 ; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s47 ; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s46 ; GFX9-NEXT: s_pack_ll_b32_b16 s40, s42, s45 ; GFX9-NEXT: s_pack_ll_b32_b16 s41, s43, s44 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s18 -; GFX9-NEXT: v_mov_b32_e32 v5, s19 -; GFX9-NEXT: v_mov_b32_e32 v6, s20 -; GFX9-NEXT: v_mov_b32_e32 v7, s21 -; GFX9-NEXT: v_mov_b32_e32 v8, s22 -; GFX9-NEXT: v_mov_b32_e32 v9, s23 -; GFX9-NEXT: v_mov_b32_e32 v10, s24 -; GFX9-NEXT: v_mov_b32_e32 v11, s25 -; GFX9-NEXT: v_mov_b32_e32 v12, s26 -; GFX9-NEXT: v_mov_b32_e32 v13, s27 -; GFX9-NEXT: v_mov_b32_e32 v14, s6 -; GFX9-NEXT: v_mov_b32_e32 v15, s7 -; GFX9-NEXT: v_mov_b32_e32 v16, s8 -; GFX9-NEXT: v_mov_b32_e32 v17, s9 -; GFX9-NEXT: v_mov_b32_e32 v18, s10 -; GFX9-NEXT: v_mov_b32_e32 v19, s11 -; GFX9-NEXT: v_mov_b32_e32 v20, s12 -; GFX9-NEXT: v_mov_b32_e32 v21, s13 -; GFX9-NEXT: v_mov_b32_e32 v22, s14 -; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-NEXT: v_mov_b32_e32 v12, s16 +; GFX9-NEXT: v_mov_b32_e32 v13, s17 +; GFX9-NEXT: v_mov_b32_e32 v14, s18 +; GFX9-NEXT: v_mov_b32_e32 v15, s19 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v18, s22 +; GFX9-NEXT: v_mov_b32_e32 v19, s23 +; GFX9-NEXT: v_mov_b32_e32 v20, s24 +; GFX9-NEXT: v_mov_b32_e32 v21, s25 +; GFX9-NEXT: v_mov_b32_e32 v22, s26 +; GFX9-NEXT: v_mov_b32_e32 v23, s27 ; GFX9-NEXT: v_mov_b32_e32 v24, s28 ; GFX9-NEXT: v_mov_b32_e32 v25, s29 ; GFX9-NEXT: v_mov_b32_e32 v26, s40 @@ -9423,45 +9618,72 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i ; GFX11-LABEL: bitcast_v28i32_to_v56f16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v11, s0 :: v_dual_mov_b32 v12, s1 +; GFX11-NEXT: v_dual_mov_b32 v13, s2 :: v_dual_mov_b32 v14, s3 +; GFX11-NEXT: v_dual_mov_b32 v15, s16 :: v_dual_mov_b32 v16, s17 +; GFX11-NEXT: v_dual_mov_b32 v17, s18 :: v_dual_mov_b32 v18, s19 +; GFX11-NEXT: v_dual_mov_b32 v19, s20 :: v_dual_mov_b32 v20, s21 +; GFX11-NEXT: v_dual_mov_b32 v21, s22 :: v_dual_mov_b32 v22, s23 +; GFX11-NEXT: v_dual_mov_b32 v23, s24 :: v_dual_mov_b32 v24, s25 +; GFX11-NEXT: v_dual_mov_b32 v25, s26 :: v_dual_mov_b32 v26, s27 +; GFX11-NEXT: v_dual_mov_b32 v27, s28 :: v_dual_mov_b32 v28, s29 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: v_readfirstlane_b32 s8, v4 -; GFX11-NEXT: v_readfirstlane_b32 s9, v5 -; GFX11-NEXT: v_readfirstlane_b32 s10, v6 -; GFX11-NEXT: v_readfirstlane_b32 s11, v7 -; GFX11-NEXT: v_readfirstlane_b32 s13, v8 -; GFX11-NEXT: v_readfirstlane_b32 s12, v9 +; GFX11-NEXT: v_readfirstlane_b32 s0, v11 +; GFX11-NEXT: v_readfirstlane_b32 s1, v12 +; GFX11-NEXT: v_readfirstlane_b32 s2, v13 +; GFX11-NEXT: v_readfirstlane_b32 s3, v14 +; GFX11-NEXT: v_readfirstlane_b32 s4, v15 +; GFX11-NEXT: v_readfirstlane_b32 s5, v16 +; GFX11-NEXT: v_readfirstlane_b32 s6, v17 +; GFX11-NEXT: v_readfirstlane_b32 s7, v18 +; GFX11-NEXT: v_readfirstlane_b32 s8, v19 +; GFX11-NEXT: v_readfirstlane_b32 s9, v20 +; GFX11-NEXT: v_readfirstlane_b32 s10, v21 +; GFX11-NEXT: v_readfirstlane_b32 s11, v22 +; GFX11-NEXT: v_readfirstlane_b32 s12, v23 +; GFX11-NEXT: v_readfirstlane_b32 s13, v24 +; GFX11-NEXT: v_readfirstlane_b32 s14, v25 +; GFX11-NEXT: v_readfirstlane_b32 s15, v26 +; GFX11-NEXT: v_readfirstlane_b32 s16, v27 +; GFX11-NEXT: v_readfirstlane_b32 s17, v28 +; GFX11-NEXT: v_readfirstlane_b32 s18, v0 +; GFX11-NEXT: v_readfirstlane_b32 s19, v1 +; GFX11-NEXT: v_readfirstlane_b32 s20, v2 +; GFX11-NEXT: v_readfirstlane_b32 s21, v3 +; GFX11-NEXT: v_readfirstlane_b32 s22, v4 +; GFX11-NEXT: v_readfirstlane_b32 s23, v5 +; GFX11-NEXT: v_readfirstlane_b32 s24, v6 +; GFX11-NEXT: v_readfirstlane_b32 s25, v7 +; GFX11-NEXT: v_readfirstlane_b32 s27, v8 +; GFX11-NEXT: v_readfirstlane_b32 s26, v9 ; GFX11-NEXT: s_mov_b32 s90, 0 -; GFX11-NEXT: s_and_b32 s14, vcc_lo, exec_lo +; GFX11-NEXT: s_and_b32 s28, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB17_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: s_lshr_b32 s14, s12, 16 -; GFX11-NEXT: s_lshr_b32 s15, s13, 16 -; GFX11-NEXT: s_lshr_b32 s40, s11, 16 -; GFX11-NEXT: s_lshr_b32 s41, s10, 16 -; GFX11-NEXT: s_lshr_b32 s42, s9, 16 -; GFX11-NEXT: s_lshr_b32 s43, s8, 16 -; GFX11-NEXT: s_lshr_b32 s44, s7, 16 -; GFX11-NEXT: s_lshr_b32 s45, s6, 16 -; GFX11-NEXT: s_lshr_b32 s46, s5, 16 -; GFX11-NEXT: s_lshr_b32 s47, s4, 16 -; GFX11-NEXT: s_lshr_b32 s56, s29, 16 -; GFX11-NEXT: s_lshr_b32 s57, s28, 16 -; GFX11-NEXT: s_lshr_b32 s58, s27, 16 -; GFX11-NEXT: s_lshr_b32 s59, s26, 16 -; GFX11-NEXT: s_lshr_b32 s60, s25, 16 -; GFX11-NEXT: s_lshr_b32 s61, s24, 16 -; GFX11-NEXT: s_lshr_b32 s62, s23, 16 -; GFX11-NEXT: s_lshr_b32 s63, s22, 16 -; GFX11-NEXT: s_lshr_b32 s72, s21, 16 -; GFX11-NEXT: s_lshr_b32 s73, s20, 16 -; GFX11-NEXT: s_lshr_b32 s74, s19, 16 -; GFX11-NEXT: s_lshr_b32 s75, s18, 16 -; GFX11-NEXT: s_lshr_b32 s76, s17, 16 -; GFX11-NEXT: s_lshr_b32 s77, s16, 16 +; GFX11-NEXT: s_lshr_b32 s28, s26, 16 +; GFX11-NEXT: s_lshr_b32 s29, s27, 16 +; GFX11-NEXT: s_lshr_b32 s40, s25, 16 +; GFX11-NEXT: s_lshr_b32 s41, s24, 16 +; GFX11-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-NEXT: s_lshr_b32 s44, s21, 16 +; GFX11-NEXT: s_lshr_b32 s45, s20, 16 +; GFX11-NEXT: s_lshr_b32 s46, s19, 16 +; GFX11-NEXT: s_lshr_b32 s47, s18, 16 +; GFX11-NEXT: s_lshr_b32 s56, s17, 16 +; GFX11-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-NEXT: s_lshr_b32 s58, s15, 16 +; GFX11-NEXT: s_lshr_b32 s59, s14, 16 +; GFX11-NEXT: s_lshr_b32 s60, s13, 16 +; GFX11-NEXT: s_lshr_b32 s61, s12, 16 +; GFX11-NEXT: s_lshr_b32 s62, s11, 16 +; GFX11-NEXT: s_lshr_b32 s63, s10, 16 +; GFX11-NEXT: s_lshr_b32 s72, s9, 16 +; GFX11-NEXT: s_lshr_b32 s73, s8, 16 +; GFX11-NEXT: s_lshr_b32 s74, s7, 16 +; GFX11-NEXT: s_lshr_b32 s75, s6, 16 +; GFX11-NEXT: s_lshr_b32 s76, s5, 16 +; GFX11-NEXT: s_lshr_b32 s77, s4, 16 ; GFX11-NEXT: s_lshr_b32 s78, s3, 16 ; GFX11-NEXT: s_lshr_b32 s79, s2, 16 ; GFX11-NEXT: s_lshr_b32 s88, s1, 16 @@ -9469,20 +9691,8 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s90 ; GFX11-NEXT: s_cbranch_vccnz .LBB17_3 ; GFX11-NEXT: .LBB17_2: ; %cmp.true -; GFX11-NEXT: s_add_i32 s12, s12, 3 -; GFX11-NEXT: s_add_i32 s13, s13, 3 -; GFX11-NEXT: s_add_i32 s11, s11, 3 -; GFX11-NEXT: s_add_i32 s10, s10, 3 -; GFX11-NEXT: s_add_i32 s9, s9, 3 -; GFX11-NEXT: s_add_i32 s8, s8, 3 -; GFX11-NEXT: s_add_i32 s7, s7, 3 -; GFX11-NEXT: s_add_i32 s6, s6, 3 -; GFX11-NEXT: s_add_i32 s5, s5, 3 -; GFX11-NEXT: s_add_i32 s4, s4, 3 -; GFX11-NEXT: s_add_i32 s29, s29, 3 -; GFX11-NEXT: s_add_i32 s28, s28, 3 -; GFX11-NEXT: s_add_i32 s27, s27, 3 ; GFX11-NEXT: s_add_i32 s26, s26, 3 +; GFX11-NEXT: s_add_i32 s27, s27, 3 ; GFX11-NEXT: s_add_i32 s25, s25, 3 ; GFX11-NEXT: s_add_i32 s24, s24, 3 ; GFX11-NEXT: s_add_i32 s23, s23, 3 @@ -9493,34 +9703,46 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i ; GFX11-NEXT: s_add_i32 s18, s18, 3 ; GFX11-NEXT: s_add_i32 s17, s17, 3 ; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s15, s15, 3 +; GFX11-NEXT: s_add_i32 s14, s14, 3 +; GFX11-NEXT: s_add_i32 s13, s13, 3 +; GFX11-NEXT: s_add_i32 s12, s12, 3 +; GFX11-NEXT: s_add_i32 s11, s11, 3 +; GFX11-NEXT: s_add_i32 s10, s10, 3 +; GFX11-NEXT: s_add_i32 s9, s9, 3 +; GFX11-NEXT: s_add_i32 s8, s8, 3 +; GFX11-NEXT: s_add_i32 s7, s7, 3 +; GFX11-NEXT: s_add_i32 s6, s6, 3 +; GFX11-NEXT: s_add_i32 s5, s5, 3 +; GFX11-NEXT: s_add_i32 s4, s4, 3 ; GFX11-NEXT: s_add_i32 s3, s3, 3 ; GFX11-NEXT: s_add_i32 s2, s2, 3 ; GFX11-NEXT: s_add_i32 s1, s1, 3 ; GFX11-NEXT: s_add_i32 s0, s0, 3 -; GFX11-NEXT: s_lshr_b32 s14, s12, 16 -; GFX11-NEXT: s_lshr_b32 s15, s13, 16 -; GFX11-NEXT: s_lshr_b32 s40, s11, 16 -; GFX11-NEXT: s_lshr_b32 s41, s10, 16 -; GFX11-NEXT: s_lshr_b32 s42, s9, 16 -; GFX11-NEXT: s_lshr_b32 s43, s8, 16 -; GFX11-NEXT: s_lshr_b32 s44, s7, 16 -; GFX11-NEXT: s_lshr_b32 s45, s6, 16 -; GFX11-NEXT: s_lshr_b32 s46, s5, 16 -; GFX11-NEXT: s_lshr_b32 s47, s4, 16 -; GFX11-NEXT: s_lshr_b32 s56, s29, 16 -; GFX11-NEXT: s_lshr_b32 s57, s28, 16 -; GFX11-NEXT: s_lshr_b32 s58, s27, 16 -; GFX11-NEXT: s_lshr_b32 s59, s26, 16 -; GFX11-NEXT: s_lshr_b32 s60, s25, 16 -; GFX11-NEXT: s_lshr_b32 s61, s24, 16 -; GFX11-NEXT: s_lshr_b32 s62, s23, 16 -; GFX11-NEXT: s_lshr_b32 s63, s22, 16 -; GFX11-NEXT: s_lshr_b32 s72, s21, 16 -; GFX11-NEXT: s_lshr_b32 s73, s20, 16 -; GFX11-NEXT: s_lshr_b32 s74, s19, 16 -; GFX11-NEXT: s_lshr_b32 s75, s18, 16 -; GFX11-NEXT: s_lshr_b32 s76, s17, 16 -; GFX11-NEXT: s_lshr_b32 s77, s16, 16 +; GFX11-NEXT: s_lshr_b32 s28, s26, 16 +; GFX11-NEXT: s_lshr_b32 s29, s27, 16 +; GFX11-NEXT: s_lshr_b32 s40, s25, 16 +; GFX11-NEXT: s_lshr_b32 s41, s24, 16 +; GFX11-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-NEXT: s_lshr_b32 s44, s21, 16 +; GFX11-NEXT: s_lshr_b32 s45, s20, 16 +; GFX11-NEXT: s_lshr_b32 s46, s19, 16 +; GFX11-NEXT: s_lshr_b32 s47, s18, 16 +; GFX11-NEXT: s_lshr_b32 s56, s17, 16 +; GFX11-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-NEXT: s_lshr_b32 s58, s15, 16 +; GFX11-NEXT: s_lshr_b32 s59, s14, 16 +; GFX11-NEXT: s_lshr_b32 s60, s13, 16 +; GFX11-NEXT: s_lshr_b32 s61, s12, 16 +; GFX11-NEXT: s_lshr_b32 s62, s11, 16 +; GFX11-NEXT: s_lshr_b32 s63, s10, 16 +; GFX11-NEXT: s_lshr_b32 s72, s9, 16 +; GFX11-NEXT: s_lshr_b32 s73, s8, 16 +; GFX11-NEXT: s_lshr_b32 s74, s7, 16 +; GFX11-NEXT: s_lshr_b32 s75, s6, 16 +; GFX11-NEXT: s_lshr_b32 s76, s5, 16 +; GFX11-NEXT: s_lshr_b32 s77, s4, 16 ; GFX11-NEXT: s_lshr_b32 s78, s3, 16 ; GFX11-NEXT: s_lshr_b32 s79, s2, 16 ; GFX11-NEXT: s_lshr_b32 s88, s1, 16 @@ -9531,44 +9753,44 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i ; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s88 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s79 ; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s78 -; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s77 -; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s76 -; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s75 -; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s74 -; GFX11-NEXT: s_pack_ll_b32_b16 s20, s20, s73 -; GFX11-NEXT: s_pack_ll_b32_b16 s21, s21, s72 -; GFX11-NEXT: s_pack_ll_b32_b16 s22, s22, s63 -; GFX11-NEXT: s_pack_ll_b32_b16 s23, s23, s62 -; GFX11-NEXT: s_pack_ll_b32_b16 s24, s24, s61 -; GFX11-NEXT: s_pack_ll_b32_b16 s25, s25, s60 -; GFX11-NEXT: s_pack_ll_b32_b16 s26, s26, s59 -; GFX11-NEXT: s_pack_ll_b32_b16 s27, s27, s58 -; GFX11-NEXT: s_pack_ll_b32_b16 s28, s28, s57 -; GFX11-NEXT: s_pack_ll_b32_b16 s29, s29, s56 -; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s47 -; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s46 -; GFX11-NEXT: s_pack_ll_b32_b16 s6, s6, s45 -; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s44 -; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s43 -; GFX11-NEXT: s_pack_ll_b32_b16 s9, s9, s42 -; GFX11-NEXT: s_pack_ll_b32_b16 s10, s10, s41 -; GFX11-NEXT: s_pack_ll_b32_b16 s11, s11, s40 -; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s15 -; GFX11-NEXT: s_pack_ll_b32_b16 s12, s12, s14 +; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s77 +; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s76 +; GFX11-NEXT: s_pack_ll_b32_b16 s6, s6, s75 +; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s74 +; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s73 +; GFX11-NEXT: s_pack_ll_b32_b16 s9, s9, s72 +; GFX11-NEXT: s_pack_ll_b32_b16 s10, s10, s63 +; GFX11-NEXT: s_pack_ll_b32_b16 s11, s11, s62 +; GFX11-NEXT: s_pack_ll_b32_b16 s12, s12, s61 +; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s60 +; GFX11-NEXT: s_pack_ll_b32_b16 s14, s14, s59 +; GFX11-NEXT: s_pack_ll_b32_b16 s15, s15, s58 +; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s57 +; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s56 +; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s47 +; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s46 +; GFX11-NEXT: s_pack_ll_b32_b16 s20, s20, s45 +; GFX11-NEXT: s_pack_ll_b32_b16 s21, s21, s44 +; GFX11-NEXT: s_pack_ll_b32_b16 s22, s22, s43 +; GFX11-NEXT: s_pack_ll_b32_b16 s23, s23, s42 +; GFX11-NEXT: s_pack_ll_b32_b16 s24, s24, s41 +; GFX11-NEXT: s_pack_ll_b32_b16 s25, s25, s40 +; GFX11-NEXT: s_pack_ll_b32_b16 s27, s27, s29 +; GFX11-NEXT: s_pack_ll_b32_b16 s26, s26, s28 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 -; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 -; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 -; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 -; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 -; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 -; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 -; GFX11-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 -; GFX11-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 -; GFX11-NEXT: v_dual_mov_b32 v22, s8 :: v_dual_mov_b32 v23, s9 -; GFX11-NEXT: v_dual_mov_b32 v24, s10 :: v_dual_mov_b32 v25, s11 -; GFX11-NEXT: v_dual_mov_b32 v26, s13 :: v_dual_mov_b32 v27, s12 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GFX11-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v21, s21 +; GFX11-NEXT: v_dual_mov_b32 v22, s22 :: v_dual_mov_b32 v23, s23 +; GFX11-NEXT: v_dual_mov_b32 v24, s24 :: v_dual_mov_b32 v25, s25 +; GFX11-NEXT: v_dual_mov_b32 v26, s27 :: v_dual_mov_b32 v27, s26 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB17_4: ; GFX11-NEXT: ; implicit-def: $sgpr89 @@ -9597,8 +9819,8 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr41 ; GFX11-NEXT: ; implicit-def: $sgpr40 -; GFX11-NEXT: ; implicit-def: $sgpr15 -; GFX11-NEXT: ; implicit-def: $sgpr14 +; GFX11-NEXT: ; implicit-def: $sgpr29 +; GFX11-NEXT: ; implicit-def: $sgpr28 ; GFX11-NEXT: s_branch .LBB17_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -16750,7 +16972,7 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 ; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 ; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 -; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v26 ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v27 ; SI-NEXT: .LBB30_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] @@ -20053,21 +20275,21 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; SI-NEXT: v_readfirstlane_b32 s43, v1 -; SI-NEXT: v_readfirstlane_b32 s42, v2 -; SI-NEXT: v_readfirstlane_b32 s41, v3 -; SI-NEXT: v_readfirstlane_b32 s40, v4 -; SI-NEXT: v_readfirstlane_b32 s15, v5 -; SI-NEXT: v_readfirstlane_b32 s14, v6 -; SI-NEXT: v_readfirstlane_b32 s13, v7 -; SI-NEXT: v_readfirstlane_b32 s12, v8 -; SI-NEXT: v_readfirstlane_b32 s11, v9 -; SI-NEXT: v_readfirstlane_b32 s10, v10 -; SI-NEXT: v_readfirstlane_b32 s8, v11 -; SI-NEXT: v_readfirstlane_b32 s7, v12 -; SI-NEXT: v_readfirstlane_b32 s6, v13 +; SI-NEXT: v_mov_b32_e32 v25, s16 +; SI-NEXT: v_mov_b32_e32 v23, s17 +; SI-NEXT: v_mov_b32_e32 v22, s18 +; SI-NEXT: v_mov_b32_e32 v20, s19 +; SI-NEXT: v_mov_b32_e32 v34, s20 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v14 +; SI-NEXT: v_mov_b32_e32 v35, s21 +; SI-NEXT: v_mov_b32_e32 v33, s22 +; SI-NEXT: v_mov_b32_e32 v32, s23 +; SI-NEXT: v_mov_b32_e32 v31, s24 +; SI-NEXT: v_mov_b32_e32 v29, s25 +; SI-NEXT: v_mov_b32_e32 v28, s26 +; SI-NEXT: v_mov_b32_e32 v27, s27 +; SI-NEXT: v_mov_b32_e32 v26, s28 +; SI-NEXT: v_mov_b32_e32 v24, s29 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -20086,407 +20308,514 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB33_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s4 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 -; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 -; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 -; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s40, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s41, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_lshr_b32 s4, s42, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 -; SI-NEXT: s_lshr_b32 s4, s43, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s6 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v57, s7 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v58, s8 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v59, s10 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v60, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v11 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v14 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v12 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v7 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v27 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v28 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v31 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v3 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v26 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v31 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v34 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v62, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v35 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v25 ; SI-NEXT: s_cbranch_execnz .LBB33_3 ; SI-NEXT: .LBB33_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_add_f32_e64 v14, s11, 1.0 -; SI-NEXT: v_add_f32_e64 v36, s6, 1.0 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e64 v10, s8, 1.0 -; SI-NEXT: v_add_f32_e64 v26, s29, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v58, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v31 -; SI-NEXT: v_add_f32_e64 v12, s10, 1.0 -; SI-NEXT: v_add_f32_e64 v33, s7, 1.0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v3, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v5, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v7, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v9, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v11, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v13, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v15, s24, 1.0 -; SI-NEXT: v_add_f32_e64 v18, s25, 1.0 -; SI-NEXT: v_add_f32_e64 v20, s26, 1.0 -; SI-NEXT: v_add_f32_e64 v22, s27, 1.0 -; SI-NEXT: v_add_f32_e64 v24, s28, 1.0 -; SI-NEXT: v_add_f32_e64 v29, s43, 1.0 -; SI-NEXT: v_add_f32_e64 v27, s42, 1.0 -; SI-NEXT: v_add_f32_e64 v25, s41, 1.0 -; SI-NEXT: v_add_f32_e64 v23, s40, 1.0 -; SI-NEXT: v_add_f32_e64 v21, s15, 1.0 -; SI-NEXT: v_add_f32_e64 v19, s14, 1.0 -; SI-NEXT: v_add_f32_e64 v17, s13, 1.0 -; SI-NEXT: v_add_f32_e64 v16, s12, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v33 -; SI-NEXT: v_add_f32_e64 v48, s9, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v3 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v35 +; SI-NEXT: v_add_f32_e32 v33, 1.0, v33 +; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v15 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v45 -; SI-NEXT: v_mov_b32_e32 v45, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v56 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v56, v6 -; SI-NEXT: v_mov_b32_e32 v47, v8 -; SI-NEXT: v_mov_b32_e32 v43, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v39, v13 +; SI-NEXT: v_mov_b32_e32 v37, v14 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: .LBB33_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v6, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v31 -; SI-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v8, v34, v8 -; SI-NEXT: buffer_store_dword v8, v6, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v6, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v21 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v3 -; SI-NEXT: v_add_i32_e32 v31, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: buffer_store_dword v6, v31, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v5 -; SI-NEXT: v_add_i32_e32 v31, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: buffer_store_dword v6, v31, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v7 -; SI-NEXT: v_add_i32_e32 v31, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: buffer_store_dword v6, v31, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 -; SI-NEXT: v_add_i32_e32 v31, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: buffer_store_dword v6, v31, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v16 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v11 -; SI-NEXT: v_add_i32_e32 v31, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: buffer_store_dword v6, v31, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: buffer_store_dword v6, v13, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v48 -; SI-NEXT: v_add_i32_e32 v13, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: buffer_store_dword v6, v13, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v14 -; SI-NEXT: v_add_i32_e32 v13, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: buffer_store_dword v6, v13, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v36 -; SI-NEXT: v_add_i32_e32 v13, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: buffer_store_dword v6, v13, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v12 -; SI-NEXT: v_add_i32_e32 v12, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: buffer_store_dword v6, v12, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v33 -; SI-NEXT: v_add_i32_e32 v12, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: buffer_store_dword v6, v12, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v10 -; SI-NEXT: v_add_i32_e32 v10, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v29 -; SI-NEXT: v_add_i32_e32 v10, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v27 -; SI-NEXT: v_add_i32_e32 v10, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v25 -; SI-NEXT: v_add_i32_e32 v10, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v23 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v21 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v19 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v17 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v16 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v60 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v59 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v58 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v43 -; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v57 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 -; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -20510,62 +20839,83 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: s_branch .LBB33_2 ; ; VI-LABEL: bitcast_v28f32_to_v56f16_scalar: @@ -26047,25 +26397,53 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v16, s30, 0 -; SI-NEXT: v_writelane_b32 v16, s31, 1 -; SI-NEXT: v_writelane_b32 v16, s34, 2 -; SI-NEXT: v_writelane_b32 v16, s35, 3 -; SI-NEXT: v_writelane_b32 v16, s36, 4 -; SI-NEXT: v_writelane_b32 v16, s37, 5 -; SI-NEXT: v_writelane_b32 v16, s38, 6 -; SI-NEXT: v_writelane_b32 v16, s39, 7 -; SI-NEXT: v_writelane_b32 v16, s48, 8 -; SI-NEXT: v_writelane_b32 v16, s49, 9 -; SI-NEXT: v_writelane_b32 v16, s50, 10 -; SI-NEXT: v_writelane_b32 v16, s51, 11 +; SI-NEXT: v_writelane_b32 v20, s30, 0 +; SI-NEXT: v_writelane_b32 v20, s31, 1 +; SI-NEXT: v_writelane_b32 v20, s34, 2 +; SI-NEXT: v_writelane_b32 v20, s35, 3 +; SI-NEXT: v_writelane_b32 v20, s36, 4 +; SI-NEXT: v_writelane_b32 v20, s37, 5 +; SI-NEXT: v_writelane_b32 v20, s38, 6 +; SI-NEXT: v_writelane_b32 v20, s39, 7 +; SI-NEXT: v_writelane_b32 v20, s48, 8 +; SI-NEXT: v_mov_b32_e32 v16, s16 +; SI-NEXT: v_mov_b32_e32 v17, s17 +; SI-NEXT: v_writelane_b32 v20, s49, 9 +; SI-NEXT: v_mov_b32_e32 v18, s18 +; SI-NEXT: v_mov_b32_e32 v19, s19 +; SI-NEXT: v_readfirstlane_b32 s44, v16 +; SI-NEXT: v_mov_b32_e32 v16, s20 +; SI-NEXT: v_readfirstlane_b32 s45, v17 +; SI-NEXT: v_mov_b32_e32 v17, s21 +; SI-NEXT: v_writelane_b32 v20, s50, 10 +; SI-NEXT: v_readfirstlane_b32 s42, v18 +; SI-NEXT: v_mov_b32_e32 v18, s22 +; SI-NEXT: v_readfirstlane_b32 s43, v19 +; SI-NEXT: v_mov_b32_e32 v19, s23 +; SI-NEXT: v_readfirstlane_b32 s40, v16 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_readfirstlane_b32 s41, v17 +; SI-NEXT: v_mov_b32_e32 v17, s25 +; SI-NEXT: v_writelane_b32 v20, s51, 11 +; SI-NEXT: v_readfirstlane_b32 s24, v18 +; SI-NEXT: v_mov_b32_e32 v18, s26 +; SI-NEXT: v_readfirstlane_b32 s25, v19 +; SI-NEXT: v_mov_b32_e32 v19, s27 +; SI-NEXT: v_readfirstlane_b32 s22, v16 +; SI-NEXT: v_mov_b32_e32 v16, s28 +; SI-NEXT: v_readfirstlane_b32 s23, v17 +; SI-NEXT: v_mov_b32_e32 v17, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; SI-NEXT: v_writelane_b32 v16, s52, 12 -; SI-NEXT: v_readfirstlane_b32 s40, v1 -; SI-NEXT: v_readfirstlane_b32 s41, v2 +; SI-NEXT: v_writelane_b32 v20, s52, 12 +; SI-NEXT: v_readfirstlane_b32 s20, v18 +; SI-NEXT: v_readfirstlane_b32 s21, v19 +; SI-NEXT: v_readfirstlane_b32 s18, v16 +; SI-NEXT: v_readfirstlane_b32 s19, v17 +; SI-NEXT: v_readfirstlane_b32 s16, v1 +; SI-NEXT: v_readfirstlane_b32 s17, v2 ; SI-NEXT: v_readfirstlane_b32 s14, v3 ; SI-NEXT: v_readfirstlane_b32 s15, v4 ; SI-NEXT: v_readfirstlane_b32 s12, v5 @@ -26077,9 +26455,9 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3 ; SI-NEXT: v_readfirstlane_b32 s6, v11 ; SI-NEXT: v_readfirstlane_b32 s7, v12 ; SI-NEXT: v_readfirstlane_b32 s4, v13 -; SI-NEXT: s_and_b64 s[42:43], vcc, exec +; SI-NEXT: s_and_b64 s[26:27], vcc, exec ; SI-NEXT: v_readfirstlane_b32 s5, v14 -; SI-NEXT: v_writelane_b32 v16, s53, 13 +; SI-NEXT: v_writelane_b32 v20, s53, 13 ; SI-NEXT: s_cbranch_scc0 .LBB41_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s30, s5, 16 @@ -26088,28 +26466,28 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3 ; SI-NEXT: s_lshr_b32 s35, s11, 16 ; SI-NEXT: s_lshr_b32 s36, s13, 16 ; SI-NEXT: s_lshr_b32 s37, s15, 16 -; SI-NEXT: s_lshr_b32 s38, s41, 16 -; SI-NEXT: s_lshr_b32 s39, s29, 16 -; SI-NEXT: s_lshr_b32 s48, s27, 16 -; SI-NEXT: s_lshr_b32 s49, s25, 16 -; SI-NEXT: s_lshr_b32 s50, s23, 16 -; SI-NEXT: s_lshr_b32 s51, s21, 16 -; SI-NEXT: s_lshr_b32 s52, s19, 16 -; SI-NEXT: s_lshr_b32 s53, s17, 16 -; SI-NEXT: s_lshr_b64 s[42:43], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[6:7], 16 +; SI-NEXT: s_lshr_b32 s38, s17, 16 +; SI-NEXT: s_lshr_b32 s39, s19, 16 +; SI-NEXT: s_lshr_b32 s48, s21, 16 +; SI-NEXT: s_lshr_b32 s49, s23, 16 +; SI-NEXT: s_lshr_b32 s50, s25, 16 +; SI-NEXT: s_lshr_b32 s51, s41, 16 +; SI-NEXT: s_lshr_b32 s52, s43, 16 +; SI-NEXT: s_lshr_b32 s53, s45, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 ; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 16 ; SI-NEXT: s_lshr_b64 s[56:57], s[10:11], 16 ; SI-NEXT: s_lshr_b64 s[58:59], s[12:13], 16 ; SI-NEXT: s_lshr_b64 s[60:61], s[14:15], 16 -; SI-NEXT: s_lshr_b64 s[62:63], s[40:41], 16 -; SI-NEXT: s_lshr_b64 s[72:73], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[74:75], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[78:79], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[88:89], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[90:91], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[92:93], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[44:45], 16 ; SI-NEXT: s_cbranch_execnz .LBB41_3 ; SI-NEXT: .LBB41_2: ; %cmp.true ; SI-NEXT: s_add_u32 s4, s4, 3 @@ -26124,151 +26502,151 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3 ; SI-NEXT: s_addc_u32 s13, s13, 0 ; SI-NEXT: s_add_u32 s14, s14, 3 ; SI-NEXT: s_addc_u32 s15, s15, 0 -; SI-NEXT: s_add_u32 s40, s40, 3 -; SI-NEXT: s_addc_u32 s41, s41, 0 -; SI-NEXT: s_add_u32 s28, s28, 3 -; SI-NEXT: s_addc_u32 s29, s29, 0 -; SI-NEXT: s_add_u32 s26, s26, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 -; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s25, s25, 0 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 ; SI-NEXT: s_add_u32 s16, s16, 3 ; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s40, s40, 3 +; SI-NEXT: s_addc_u32 s41, s41, 0 +; SI-NEXT: s_add_u32 s42, s42, 3 +; SI-NEXT: s_addc_u32 s43, s43, 0 +; SI-NEXT: s_add_u32 s44, s44, 3 +; SI-NEXT: s_addc_u32 s45, s45, 0 ; SI-NEXT: s_lshr_b32 s30, s5, 16 ; SI-NEXT: s_lshr_b32 s31, s7, 16 ; SI-NEXT: s_lshr_b32 s34, s9, 16 ; SI-NEXT: s_lshr_b32 s35, s11, 16 ; SI-NEXT: s_lshr_b32 s36, s13, 16 ; SI-NEXT: s_lshr_b32 s37, s15, 16 -; SI-NEXT: s_lshr_b32 s38, s41, 16 -; SI-NEXT: s_lshr_b32 s39, s29, 16 -; SI-NEXT: s_lshr_b32 s48, s27, 16 -; SI-NEXT: s_lshr_b32 s49, s25, 16 -; SI-NEXT: s_lshr_b32 s50, s23, 16 -; SI-NEXT: s_lshr_b32 s51, s21, 16 -; SI-NEXT: s_lshr_b32 s52, s19, 16 -; SI-NEXT: s_lshr_b32 s53, s17, 16 -; SI-NEXT: s_lshr_b64 s[42:43], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[6:7], 16 +; SI-NEXT: s_lshr_b32 s38, s17, 16 +; SI-NEXT: s_lshr_b32 s39, s19, 16 +; SI-NEXT: s_lshr_b32 s48, s21, 16 +; SI-NEXT: s_lshr_b32 s49, s23, 16 +; SI-NEXT: s_lshr_b32 s50, s25, 16 +; SI-NEXT: s_lshr_b32 s51, s41, 16 +; SI-NEXT: s_lshr_b32 s52, s43, 16 +; SI-NEXT: s_lshr_b32 s53, s45, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 ; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 16 ; SI-NEXT: s_lshr_b64 s[56:57], s[10:11], 16 ; SI-NEXT: s_lshr_b64 s[58:59], s[12:13], 16 ; SI-NEXT: s_lshr_b64 s[60:61], s[14:15], 16 -; SI-NEXT: s_lshr_b64 s[62:63], s[40:41], 16 -; SI-NEXT: s_lshr_b64 s[72:73], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[74:75], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[78:79], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[88:89], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[90:91], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[92:93], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[44:45], 16 ; SI-NEXT: .LBB41_3: ; %end -; SI-NEXT: s_lshl_b32 s43, s92, 16 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s16, s16, s43 -; SI-NEXT: v_mov_b32_e32 v1, s16 -; SI-NEXT: s_and_b32 s16, s17, 0xffff -; SI-NEXT: s_lshl_b32 s17, s53, 16 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_lshl_b32 s16, s90, 16 -; SI-NEXT: s_and_b32 s17, s18, 0xffff -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_mov_b32_e32 v3, s16 -; SI-NEXT: s_and_b32 s16, s19, 0xffff -; SI-NEXT: s_lshl_b32 s17, s52, 16 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_mov_b32_e32 v4, s16 +; SI-NEXT: s_lshl_b32 s27, s92, 16 +; SI-NEXT: s_and_b32 s29, s44, 0xffff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: v_mov_b32_e32 v1, s27 +; SI-NEXT: s_and_b32 s27, s45, 0xffff +; SI-NEXT: s_lshl_b32 s29, s53, 16 +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_lshl_b32 s27, s90, 16 +; SI-NEXT: s_and_b32 s29, s42, 0xffff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: v_mov_b32_e32 v3, s27 +; SI-NEXT: s_and_b32 s27, s43, 0xffff +; SI-NEXT: s_lshl_b32 s29, s52, 16 +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_mov_b32_e32 v4, s27 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: s_lshl_b32 s16, s88, 16 -; SI-NEXT: s_and_b32 s17, s20, 0xffff +; SI-NEXT: s_lshl_b32 s27, s88, 16 +; SI-NEXT: s_and_b32 s29, s40, 0xffff ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_or_b32 s27, s29, s27 ; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s21, 0xffff -; SI-NEXT: s_lshl_b32 s17, s51, 16 +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_and_b32 s27, s41, 0xffff +; SI-NEXT: s_lshl_b32 s29, s51, 16 ; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s22, 0xffff -; SI-NEXT: s_lshl_b32 s17, s78, 16 +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_lshl_b32 s27, s78, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s24, s24, s27 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s23, 0xffff -; SI-NEXT: s_lshl_b32 s17, s50, 16 +; SI-NEXT: v_mov_b32_e32 v2, s24 +; SI-NEXT: s_and_b32 s24, s25, 0xffff +; SI-NEXT: s_lshl_b32 s25, s50, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s24, s24, s25 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s24, 0xffff -; SI-NEXT: s_lshl_b32 s17, s76, 16 +; SI-NEXT: v_mov_b32_e32 v2, s24 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_lshl_b32 s24, s76, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s22, s22, s24 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s25, 0xffff -; SI-NEXT: s_lshl_b32 s17, s49, 16 +; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: s_and_b32 s22, s23, 0xffff +; SI-NEXT: s_lshl_b32 s23, s49, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s22, s22, s23 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s26, 0xffff -; SI-NEXT: s_lshl_b32 s17, s74, 16 +; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_lshl_b32 s22, s74, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s20, s20, s22 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s27, 0xffff -; SI-NEXT: s_lshl_b32 s17, s48, 16 +; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_and_b32 s20, s21, 0xffff +; SI-NEXT: s_lshl_b32 s21, s48, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s20, s20, s21 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s28, 0xffff -; SI-NEXT: s_lshl_b32 s17, s72, 16 +; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s20, s72, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s18, s18, s20 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s29, 0xffff -; SI-NEXT: s_lshl_b32 s17, s39, 16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_and_b32 s18, s19, 0xffff +; SI-NEXT: s_lshl_b32 s19, s39, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s40, 0xffff -; SI-NEXT: s_lshl_b32 s17, s62, 16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s18, s62, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s16, s16, s18 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s41, 0xffff +; SI-NEXT: s_and_b32 s16, s17, 0xffff ; SI-NEXT: s_lshl_b32 s17, s38, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 ; SI-NEXT: s_or_b32 s16, s16, s17 @@ -26332,7 +26710,7 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_lshl_b32 s8, s44, 16 +; SI-NEXT: s_lshl_b32 s8, s28, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x5c, v0 ; SI-NEXT: s_or_b32 s6, s6, s8 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -26346,7 +26724,7 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s6, s42, 16 +; SI-NEXT: s_lshl_b32 s6, s26, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x64, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -26360,22 +26738,22 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: v_readlane_b32 s53, v16, 13 -; SI-NEXT: v_readlane_b32 s52, v16, 12 -; SI-NEXT: v_readlane_b32 s51, v16, 11 -; SI-NEXT: v_readlane_b32 s50, v16, 10 -; SI-NEXT: v_readlane_b32 s49, v16, 9 -; SI-NEXT: v_readlane_b32 s48, v16, 8 -; SI-NEXT: v_readlane_b32 s39, v16, 7 -; SI-NEXT: v_readlane_b32 s38, v16, 6 -; SI-NEXT: v_readlane_b32 s37, v16, 5 -; SI-NEXT: v_readlane_b32 s36, v16, 4 -; SI-NEXT: v_readlane_b32 s35, v16, 3 -; SI-NEXT: v_readlane_b32 s34, v16, 2 -; SI-NEXT: v_readlane_b32 s31, v16, 1 -; SI-NEXT: v_readlane_b32 s30, v16, 0 +; SI-NEXT: v_readlane_b32 s53, v20, 13 +; SI-NEXT: v_readlane_b32 s52, v20, 12 +; SI-NEXT: v_readlane_b32 s51, v20, 11 +; SI-NEXT: v_readlane_b32 s50, v20, 10 +; SI-NEXT: v_readlane_b32 s49, v20, 9 +; SI-NEXT: v_readlane_b32 s48, v20, 8 +; SI-NEXT: v_readlane_b32 s39, v20, 7 +; SI-NEXT: v_readlane_b32 s38, v20, 6 +; SI-NEXT: v_readlane_b32 s37, v20, 5 +; SI-NEXT: v_readlane_b32 s36, v20, 4 +; SI-NEXT: v_readlane_b32 s35, v20, 3 +; SI-NEXT: v_readlane_b32 s34, v20, 2 +; SI-NEXT: v_readlane_b32 s31, v20, 1 +; SI-NEXT: v_readlane_b32 s30, v20, 0 ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -26404,9 +26782,9 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr35 ; SI-NEXT: ; implicit-def: $sgpr46 ; SI-NEXT: ; implicit-def: $sgpr34 -; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr31 -; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr30 ; SI-NEXT: s_branch .LBB41_2 ; @@ -26416,14 +26794,42 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3 ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v15, s16 +; VI-NEXT: v_mov_b32_e32 v16, s17 +; VI-NEXT: v_mov_b32_e32 v17, s18 +; VI-NEXT: v_mov_b32_e32 v18, s19 ; VI-NEXT: v_writelane_b32 v28, s30, 0 +; VI-NEXT: v_mov_b32_e32 v19, s20 +; VI-NEXT: v_readfirstlane_b32 s46, v15 +; VI-NEXT: v_mov_b32_e32 v15, s21 +; VI-NEXT: v_readfirstlane_b32 s45, v16 +; VI-NEXT: v_mov_b32_e32 v16, s22 +; VI-NEXT: v_readfirstlane_b32 s44, v17 +; VI-NEXT: v_mov_b32_e32 v17, s23 +; VI-NEXT: v_readfirstlane_b32 s43, v18 +; VI-NEXT: v_mov_b32_e32 v18, s24 ; VI-NEXT: v_writelane_b32 v28, s31, 1 +; VI-NEXT: v_readfirstlane_b32 s42, v19 +; VI-NEXT: v_mov_b32_e32 v19, s25 +; VI-NEXT: v_readfirstlane_b32 s41, v15 +; VI-NEXT: v_mov_b32_e32 v15, s26 +; VI-NEXT: v_readfirstlane_b32 s40, v16 +; VI-NEXT: v_mov_b32_e32 v16, s27 +; VI-NEXT: v_readfirstlane_b32 s26, v17 +; VI-NEXT: v_mov_b32_e32 v17, s28 +; VI-NEXT: v_readfirstlane_b32 s25, v18 +; VI-NEXT: v_mov_b32_e32 v18, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 ; VI-NEXT: v_writelane_b32 v28, s34, 2 -; VI-NEXT: v_readfirstlane_b32 s43, v0 -; VI-NEXT: v_readfirstlane_b32 s42, v1 -; VI-NEXT: v_readfirstlane_b32 s41, v2 -; VI-NEXT: v_readfirstlane_b32 s40, v3 +; VI-NEXT: v_readfirstlane_b32 s24, v19 +; VI-NEXT: v_readfirstlane_b32 s23, v15 +; VI-NEXT: v_readfirstlane_b32 s22, v16 +; VI-NEXT: v_readfirstlane_b32 s21, v17 +; VI-NEXT: v_readfirstlane_b32 s20, v18 +; VI-NEXT: v_readfirstlane_b32 s19, v0 +; VI-NEXT: v_readfirstlane_b32 s18, v1 +; VI-NEXT: v_readfirstlane_b32 s17, v2 +; VI-NEXT: v_readfirstlane_b32 s16, v3 ; VI-NEXT: v_readfirstlane_b32 s15, v4 ; VI-NEXT: v_readfirstlane_b32 s14, v5 ; VI-NEXT: v_readfirstlane_b32 s13, v6 @@ -26438,9 +26844,9 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3 ; VI-NEXT: v_writelane_b32 v28, s35, 3 ; VI-NEXT: s_cbranch_scc0 .LBB41_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s44, s7, 16 -; VI-NEXT: s_lshr_b32 s45, s6, 16 -; VI-NEXT: s_lshr_b32 s46, s8, 16 +; VI-NEXT: s_lshr_b32 s27, s7, 16 +; VI-NEXT: s_lshr_b32 s28, s6, 16 +; VI-NEXT: s_lshr_b32 s29, s8, 16 ; VI-NEXT: s_lshr_b32 s47, s9, 16 ; VI-NEXT: s_lshr_b32 s56, s10, 16 ; VI-NEXT: s_lshr_b32 s57, s11, 16 @@ -26448,24 +26854,24 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3 ; VI-NEXT: s_lshr_b32 s59, s13, 16 ; VI-NEXT: s_lshr_b32 s60, s14, 16 ; VI-NEXT: s_lshr_b32 s61, s15, 16 -; VI-NEXT: s_lshr_b32 s62, s40, 16 -; VI-NEXT: s_lshr_b32 s63, s41, 16 -; VI-NEXT: s_lshr_b32 s72, s42, 16 -; VI-NEXT: s_lshr_b32 s73, s43, 16 -; VI-NEXT: s_lshr_b32 s74, s29, 16 -; VI-NEXT: s_lshr_b32 s75, s28, 16 -; VI-NEXT: s_lshr_b32 s76, s27, 16 -; VI-NEXT: s_lshr_b32 s77, s26, 16 -; VI-NEXT: s_lshr_b32 s78, s25, 16 -; VI-NEXT: s_lshr_b32 s79, s24, 16 -; VI-NEXT: s_lshr_b32 s88, s23, 16 -; VI-NEXT: s_lshr_b32 s89, s22, 16 -; VI-NEXT: s_lshr_b32 s90, s21, 16 -; VI-NEXT: s_lshr_b32 s91, s20, 16 -; VI-NEXT: s_lshr_b32 s30, s19, 16 -; VI-NEXT: s_lshr_b32 s31, s18, 16 -; VI-NEXT: s_lshr_b32 s34, s17, 16 -; VI-NEXT: s_lshr_b32 s35, s16, 16 +; VI-NEXT: s_lshr_b32 s62, s16, 16 +; VI-NEXT: s_lshr_b32 s63, s17, 16 +; VI-NEXT: s_lshr_b32 s72, s18, 16 +; VI-NEXT: s_lshr_b32 s73, s19, 16 +; VI-NEXT: s_lshr_b32 s74, s20, 16 +; VI-NEXT: s_lshr_b32 s75, s21, 16 +; VI-NEXT: s_lshr_b32 s76, s22, 16 +; VI-NEXT: s_lshr_b32 s77, s23, 16 +; VI-NEXT: s_lshr_b32 s78, s24, 16 +; VI-NEXT: s_lshr_b32 s79, s25, 16 +; VI-NEXT: s_lshr_b32 s88, s26, 16 +; VI-NEXT: s_lshr_b32 s89, s40, 16 +; VI-NEXT: s_lshr_b32 s90, s41, 16 +; VI-NEXT: s_lshr_b32 s91, s42, 16 +; VI-NEXT: s_lshr_b32 s30, s43, 16 +; VI-NEXT: s_lshr_b32 s31, s44, 16 +; VI-NEXT: s_lshr_b32 s34, s45, 16 +; VI-NEXT: s_lshr_b32 s35, s46, 16 ; VI-NEXT: s_cbranch_execnz .LBB41_3 ; VI-NEXT: .LBB41_2: ; %cmp.true ; VI-NEXT: s_add_u32 s6, s6, 3 @@ -26478,27 +26884,27 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3 ; VI-NEXT: s_addc_u32 s12, s12, 0 ; VI-NEXT: s_add_u32 s15, s15, 3 ; VI-NEXT: s_addc_u32 s14, s14, 0 -; VI-NEXT: s_add_u32 s41, s41, 3 -; VI-NEXT: s_addc_u32 s40, s40, 0 -; VI-NEXT: s_add_u32 s43, s43, 3 -; VI-NEXT: s_addc_u32 s42, s42, 0 -; VI-NEXT: s_add_u32 s28, s28, 3 -; VI-NEXT: s_addc_u32 s29, s29, 0 -; VI-NEXT: s_add_u32 s26, s26, 3 -; VI-NEXT: s_addc_u32 s27, s27, 0 -; VI-NEXT: s_add_u32 s24, s24, 3 -; VI-NEXT: s_addc_u32 s25, s25, 0 -; VI-NEXT: s_add_u32 s22, s22, 3 -; VI-NEXT: s_addc_u32 s23, s23, 0 -; VI-NEXT: s_add_u32 s20, s20, 3 -; VI-NEXT: s_addc_u32 s21, s21, 0 -; VI-NEXT: s_add_u32 s18, s18, 3 -; VI-NEXT: s_addc_u32 s19, s19, 0 -; VI-NEXT: s_add_u32 s16, s16, 3 -; VI-NEXT: s_addc_u32 s17, s17, 0 -; VI-NEXT: s_lshr_b32 s44, s7, 16 -; VI-NEXT: s_lshr_b32 s45, s6, 16 -; VI-NEXT: s_lshr_b32 s46, s8, 16 +; VI-NEXT: s_add_u32 s17, s17, 3 +; VI-NEXT: s_addc_u32 s16, s16, 0 +; VI-NEXT: s_add_u32 s19, s19, 3 +; VI-NEXT: s_addc_u32 s18, s18, 0 +; VI-NEXT: s_add_u32 s21, s21, 3 +; VI-NEXT: s_addc_u32 s20, s20, 0 +; VI-NEXT: s_add_u32 s23, s23, 3 +; VI-NEXT: s_addc_u32 s22, s22, 0 +; VI-NEXT: s_add_u32 s25, s25, 3 +; VI-NEXT: s_addc_u32 s24, s24, 0 +; VI-NEXT: s_add_u32 s40, s40, 3 +; VI-NEXT: s_addc_u32 s26, s26, 0 +; VI-NEXT: s_add_u32 s42, s42, 3 +; VI-NEXT: s_addc_u32 s41, s41, 0 +; VI-NEXT: s_add_u32 s44, s44, 3 +; VI-NEXT: s_addc_u32 s43, s43, 0 +; VI-NEXT: s_add_u32 s46, s46, 3 +; VI-NEXT: s_addc_u32 s45, s45, 0 +; VI-NEXT: s_lshr_b32 s27, s7, 16 +; VI-NEXT: s_lshr_b32 s28, s6, 16 +; VI-NEXT: s_lshr_b32 s29, s8, 16 ; VI-NEXT: s_lshr_b32 s47, s9, 16 ; VI-NEXT: s_lshr_b32 s56, s10, 16 ; VI-NEXT: s_lshr_b32 s57, s11, 16 @@ -26506,127 +26912,127 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3 ; VI-NEXT: s_lshr_b32 s59, s13, 16 ; VI-NEXT: s_lshr_b32 s60, s14, 16 ; VI-NEXT: s_lshr_b32 s61, s15, 16 -; VI-NEXT: s_lshr_b32 s62, s40, 16 -; VI-NEXT: s_lshr_b32 s63, s41, 16 -; VI-NEXT: s_lshr_b32 s72, s42, 16 -; VI-NEXT: s_lshr_b32 s73, s43, 16 -; VI-NEXT: s_lshr_b32 s74, s29, 16 -; VI-NEXT: s_lshr_b32 s75, s28, 16 -; VI-NEXT: s_lshr_b32 s76, s27, 16 -; VI-NEXT: s_lshr_b32 s77, s26, 16 -; VI-NEXT: s_lshr_b32 s78, s25, 16 -; VI-NEXT: s_lshr_b32 s79, s24, 16 -; VI-NEXT: s_lshr_b32 s88, s23, 16 -; VI-NEXT: s_lshr_b32 s89, s22, 16 -; VI-NEXT: s_lshr_b32 s90, s21, 16 -; VI-NEXT: s_lshr_b32 s91, s20, 16 -; VI-NEXT: s_lshr_b32 s30, s19, 16 -; VI-NEXT: s_lshr_b32 s31, s18, 16 -; VI-NEXT: s_lshr_b32 s34, s17, 16 -; VI-NEXT: s_lshr_b32 s35, s16, 16 +; VI-NEXT: s_lshr_b32 s62, s16, 16 +; VI-NEXT: s_lshr_b32 s63, s17, 16 +; VI-NEXT: s_lshr_b32 s72, s18, 16 +; VI-NEXT: s_lshr_b32 s73, s19, 16 +; VI-NEXT: s_lshr_b32 s74, s20, 16 +; VI-NEXT: s_lshr_b32 s75, s21, 16 +; VI-NEXT: s_lshr_b32 s76, s22, 16 +; VI-NEXT: s_lshr_b32 s77, s23, 16 +; VI-NEXT: s_lshr_b32 s78, s24, 16 +; VI-NEXT: s_lshr_b32 s79, s25, 16 +; VI-NEXT: s_lshr_b32 s88, s26, 16 +; VI-NEXT: s_lshr_b32 s89, s40, 16 +; VI-NEXT: s_lshr_b32 s90, s41, 16 +; VI-NEXT: s_lshr_b32 s91, s42, 16 +; VI-NEXT: s_lshr_b32 s30, s43, 16 +; VI-NEXT: s_lshr_b32 s31, s44, 16 +; VI-NEXT: s_lshr_b32 s34, s45, 16 +; VI-NEXT: s_lshr_b32 s35, s46, 16 ; VI-NEXT: .LBB41_3: ; %end -; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_and_b32 s4, 0xffff, s46 ; VI-NEXT: s_lshl_b32 s5, s35, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s16, s34, 16 -; VI-NEXT: s_or_b32 s5, s5, s16 -; VI-NEXT: s_and_b32 s16, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s17, s31, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_and_b32 s17, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s18, s30, 16 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s18, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s19, s91, 16 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_and_b32 s19, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s20, s90, 16 -; VI-NEXT: s_or_b32 s19, s19, s20 -; VI-NEXT: s_and_b32 s20, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s21, s89, 16 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_and_b32 s21, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s22, s88, 16 -; VI-NEXT: s_or_b32 s21, s21, s22 -; VI-NEXT: s_and_b32 s22, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s23, s79, 16 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_and_b32 s23, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s24, s78, 16 -; VI-NEXT: s_or_b32 s23, s23, s24 -; VI-NEXT: s_and_b32 s24, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s25, s77, 16 -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_and_b32 s25, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s26, s76, 16 -; VI-NEXT: s_or_b32 s25, s25, s26 -; VI-NEXT: s_and_b32 s26, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s27, s75, 16 -; VI-NEXT: s_or_b32 s26, s26, s27 -; VI-NEXT: s_and_b32 s27, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s28, s74, 16 -; VI-NEXT: s_or_b32 s27, s27, s28 -; VI-NEXT: s_and_b32 s28, 0xffff, s43 -; VI-NEXT: s_lshl_b32 s29, s73, 16 -; VI-NEXT: s_or_b32 s28, s28, s29 -; VI-NEXT: s_and_b32 s29, 0xffff, s42 -; VI-NEXT: s_lshl_b32 s42, s72, 16 -; VI-NEXT: s_or_b32 s29, s29, s42 +; VI-NEXT: s_and_b32 s5, 0xffff, s45 +; VI-NEXT: s_lshl_b32 s45, s34, 16 +; VI-NEXT: s_or_b32 s5, s5, s45 +; VI-NEXT: s_and_b32 s44, 0xffff, s44 +; VI-NEXT: s_lshl_b32 s45, s31, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s43, 0xffff, s43 +; VI-NEXT: s_lshl_b32 s45, s30, 16 +; VI-NEXT: s_or_b32 s43, s43, s45 +; VI-NEXT: s_and_b32 s42, 0xffff, s42 +; VI-NEXT: s_lshl_b32 s45, s91, 16 +; VI-NEXT: s_or_b32 s42, s42, s45 ; VI-NEXT: s_and_b32 s41, 0xffff, s41 -; VI-NEXT: s_lshl_b32 s42, s63, 16 -; VI-NEXT: s_or_b32 s41, s41, s42 +; VI-NEXT: s_lshl_b32 s45, s90, 16 +; VI-NEXT: s_or_b32 s41, s41, s45 ; VI-NEXT: s_and_b32 s40, 0xffff, s40 -; VI-NEXT: s_lshl_b32 s42, s62, 16 -; VI-NEXT: s_or_b32 s40, s40, s42 +; VI-NEXT: s_lshl_b32 s45, s89, 16 +; VI-NEXT: s_or_b32 s40, s40, s45 +; VI-NEXT: s_and_b32 s26, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s45, s88, 16 +; VI-NEXT: s_or_b32 s26, s26, s45 +; VI-NEXT: s_and_b32 s25, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s45, s79, 16 +; VI-NEXT: s_or_b32 s25, s25, s45 +; VI-NEXT: s_and_b32 s24, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s45, s78, 16 +; VI-NEXT: s_or_b32 s24, s24, s45 +; VI-NEXT: s_and_b32 s23, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s45, s77, 16 +; VI-NEXT: s_or_b32 s23, s23, s45 +; VI-NEXT: s_and_b32 s22, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s45, s76, 16 +; VI-NEXT: s_or_b32 s22, s22, s45 +; VI-NEXT: s_and_b32 s21, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s45, s75, 16 +; VI-NEXT: s_or_b32 s21, s21, s45 +; VI-NEXT: s_and_b32 s20, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s45, s74, 16 +; VI-NEXT: s_or_b32 s20, s20, s45 +; VI-NEXT: s_and_b32 s19, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s45, s73, 16 +; VI-NEXT: s_or_b32 s19, s19, s45 +; VI-NEXT: s_and_b32 s18, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s72, 16 +; VI-NEXT: s_or_b32 s18, s18, s45 +; VI-NEXT: s_and_b32 s17, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s45, s63, 16 +; VI-NEXT: s_or_b32 s17, s17, s45 +; VI-NEXT: s_and_b32 s16, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s45, s62, 16 +; VI-NEXT: s_or_b32 s16, s16, s45 ; VI-NEXT: s_and_b32 s15, 0xffff, s15 -; VI-NEXT: s_lshl_b32 s42, s61, 16 -; VI-NEXT: s_or_b32 s15, s15, s42 +; VI-NEXT: s_lshl_b32 s45, s61, 16 +; VI-NEXT: s_or_b32 s15, s15, s45 ; VI-NEXT: s_and_b32 s14, 0xffff, s14 -; VI-NEXT: s_lshl_b32 s42, s60, 16 -; VI-NEXT: s_or_b32 s14, s14, s42 +; VI-NEXT: s_lshl_b32 s45, s60, 16 +; VI-NEXT: s_or_b32 s14, s14, s45 ; VI-NEXT: s_and_b32 s13, 0xffff, s13 -; VI-NEXT: s_lshl_b32 s42, s59, 16 -; VI-NEXT: s_or_b32 s13, s13, s42 +; VI-NEXT: s_lshl_b32 s45, s59, 16 +; VI-NEXT: s_or_b32 s13, s13, s45 ; VI-NEXT: s_and_b32 s12, 0xffff, s12 -; VI-NEXT: s_lshl_b32 s42, s58, 16 -; VI-NEXT: s_or_b32 s12, s12, s42 +; VI-NEXT: s_lshl_b32 s45, s58, 16 +; VI-NEXT: s_or_b32 s12, s12, s45 ; VI-NEXT: s_and_b32 s11, 0xffff, s11 -; VI-NEXT: s_lshl_b32 s42, s57, 16 -; VI-NEXT: s_or_b32 s11, s11, s42 +; VI-NEXT: s_lshl_b32 s45, s57, 16 +; VI-NEXT: s_or_b32 s11, s11, s45 ; VI-NEXT: s_and_b32 s10, 0xffff, s10 -; VI-NEXT: s_lshl_b32 s42, s56, 16 -; VI-NEXT: s_or_b32 s10, s10, s42 +; VI-NEXT: s_lshl_b32 s45, s56, 16 +; VI-NEXT: s_or_b32 s10, s10, s45 ; VI-NEXT: s_and_b32 s9, 0xffff, s9 -; VI-NEXT: s_lshl_b32 s42, s47, 16 -; VI-NEXT: s_or_b32 s9, s9, s42 +; VI-NEXT: s_lshl_b32 s45, s47, 16 ; VI-NEXT: s_and_b32 s8, 0xffff, s8 -; VI-NEXT: s_lshl_b32 s42, s46, 16 -; VI-NEXT: s_or_b32 s8, s8, s42 +; VI-NEXT: s_lshl_b32 s29, s29, 16 ; VI-NEXT: s_and_b32 s6, 0xffff, s6 -; VI-NEXT: s_lshl_b32 s42, s45, 16 -; VI-NEXT: s_or_b32 s6, s6, s42 +; VI-NEXT: s_lshl_b32 s28, s28, 16 ; VI-NEXT: s_and_b32 s7, 0xffff, s7 -; VI-NEXT: s_lshl_b32 s42, s44, 16 -; VI-NEXT: s_or_b32 s7, s7, s42 +; VI-NEXT: s_lshl_b32 s27, s27, 16 +; VI-NEXT: s_or_b32 s9, s9, s45 +; VI-NEXT: s_or_b32 s8, s8, s29 +; VI-NEXT: s_or_b32 s6, s6, s28 +; VI-NEXT: s_or_b32 s7, s7, s27 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: v_mov_b32_e32 v3, s17 -; VI-NEXT: v_mov_b32_e32 v4, s18 -; VI-NEXT: v_mov_b32_e32 v5, s19 -; VI-NEXT: v_mov_b32_e32 v6, s20 -; VI-NEXT: v_mov_b32_e32 v7, s21 -; VI-NEXT: v_mov_b32_e32 v8, s22 -; VI-NEXT: v_mov_b32_e32 v9, s23 -; VI-NEXT: v_mov_b32_e32 v10, s24 -; VI-NEXT: v_mov_b32_e32 v11, s25 -; VI-NEXT: v_mov_b32_e32 v12, s26 -; VI-NEXT: v_mov_b32_e32 v13, s27 -; VI-NEXT: v_mov_b32_e32 v14, s28 -; VI-NEXT: v_mov_b32_e32 v15, s29 -; VI-NEXT: v_mov_b32_e32 v16, s41 -; VI-NEXT: v_mov_b32_e32 v17, s40 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s43 +; VI-NEXT: v_mov_b32_e32 v4, s42 +; VI-NEXT: v_mov_b32_e32 v5, s41 +; VI-NEXT: v_mov_b32_e32 v6, s40 +; VI-NEXT: v_mov_b32_e32 v7, s26 +; VI-NEXT: v_mov_b32_e32 v8, s25 +; VI-NEXT: v_mov_b32_e32 v9, s24 +; VI-NEXT: v_mov_b32_e32 v10, s23 +; VI-NEXT: v_mov_b32_e32 v11, s22 +; VI-NEXT: v_mov_b32_e32 v12, s21 +; VI-NEXT: v_mov_b32_e32 v13, s20 +; VI-NEXT: v_mov_b32_e32 v14, s19 +; VI-NEXT: v_mov_b32_e32 v15, s18 +; VI-NEXT: v_mov_b32_e32 v16, s17 +; VI-NEXT: v_mov_b32_e32 v17, s16 ; VI-NEXT: v_mov_b32_e32 v18, s15 ; VI-NEXT: v_mov_b32_e32 v19, s14 ; VI-NEXT: v_mov_b32_e32 v20, s13 @@ -26672,25 +27078,53 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3 ; VI-NEXT: ; implicit-def: $sgpr57 ; VI-NEXT: ; implicit-def: $sgpr56 ; VI-NEXT: ; implicit-def: $sgpr47 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr45 -; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 ; VI-NEXT: s_branch .LBB41_2 ; ; GFX9-LABEL: bitcast_v14i64_to_v56i16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v15, s16 +; GFX9-NEXT: v_mov_b32_e32 v16, s17 +; GFX9-NEXT: v_mov_b32_e32 v17, s18 +; GFX9-NEXT: v_mov_b32_e32 v18, s19 +; GFX9-NEXT: v_mov_b32_e32 v19, s20 +; GFX9-NEXT: v_readfirstlane_b32 s6, v15 +; GFX9-NEXT: v_mov_b32_e32 v15, s21 +; GFX9-NEXT: v_readfirstlane_b32 s7, v16 +; GFX9-NEXT: v_mov_b32_e32 v16, s22 +; GFX9-NEXT: v_readfirstlane_b32 s8, v17 +; GFX9-NEXT: v_mov_b32_e32 v17, s23 +; GFX9-NEXT: v_readfirstlane_b32 s9, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, s24 +; GFX9-NEXT: v_readfirstlane_b32 s10, v19 +; GFX9-NEXT: v_mov_b32_e32 v19, s25 +; GFX9-NEXT: v_readfirstlane_b32 s11, v15 +; GFX9-NEXT: v_mov_b32_e32 v15, s26 +; GFX9-NEXT: v_readfirstlane_b32 s12, v16 +; GFX9-NEXT: v_mov_b32_e32 v16, s27 +; GFX9-NEXT: v_readfirstlane_b32 s13, v17 +; GFX9-NEXT: v_mov_b32_e32 v17, s28 +; GFX9-NEXT: v_readfirstlane_b32 s14, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: v_readfirstlane_b32 s7, v1 -; GFX9-NEXT: v_readfirstlane_b32 s8, v2 -; GFX9-NEXT: v_readfirstlane_b32 s9, v3 -; GFX9-NEXT: v_readfirstlane_b32 s10, v4 -; GFX9-NEXT: v_readfirstlane_b32 s11, v5 -; GFX9-NEXT: v_readfirstlane_b32 s12, v6 -; GFX9-NEXT: v_readfirstlane_b32 s13, v7 -; GFX9-NEXT: v_readfirstlane_b32 s14, v8 -; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s15, v19 +; GFX9-NEXT: v_readfirstlane_b32 s16, v15 +; GFX9-NEXT: v_readfirstlane_b32 s17, v16 +; GFX9-NEXT: v_readfirstlane_b32 s18, v17 +; GFX9-NEXT: v_readfirstlane_b32 s19, v18 +; GFX9-NEXT: v_readfirstlane_b32 s20, v0 +; GFX9-NEXT: v_readfirstlane_b32 s21, v1 +; GFX9-NEXT: v_readfirstlane_b32 s22, v2 +; GFX9-NEXT: v_readfirstlane_b32 s23, v3 +; GFX9-NEXT: v_readfirstlane_b32 s24, v4 +; GFX9-NEXT: v_readfirstlane_b32 s25, v5 +; GFX9-NEXT: v_readfirstlane_b32 s26, v6 +; GFX9-NEXT: v_readfirstlane_b32 s27, v7 +; GFX9-NEXT: v_readfirstlane_b32 s28, v8 +; GFX9-NEXT: v_readfirstlane_b32 s29, v9 ; GFX9-NEXT: v_readfirstlane_b32 s40, v10 ; GFX9-NEXT: v_readfirstlane_b32 s41, v11 ; GFX9-NEXT: v_readfirstlane_b32 s42, v12 @@ -26702,46 +27136,36 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3 ; GFX9-NEXT: s_lshr_b32 s45, s42, 16 ; GFX9-NEXT: s_lshr_b32 s46, s41, 16 ; GFX9-NEXT: s_lshr_b32 s47, s40, 16 -; GFX9-NEXT: s_lshr_b32 s56, s15, 16 -; GFX9-NEXT: s_lshr_b32 s57, s14, 16 -; GFX9-NEXT: s_lshr_b32 s58, s13, 16 -; GFX9-NEXT: s_lshr_b32 s59, s12, 16 -; GFX9-NEXT: s_lshr_b32 s60, s11, 16 -; GFX9-NEXT: s_lshr_b32 s61, s10, 16 -; GFX9-NEXT: s_lshr_b32 s62, s9, 16 -; GFX9-NEXT: s_lshr_b32 s63, s8, 16 -; GFX9-NEXT: s_lshr_b32 s72, s7, 16 -; GFX9-NEXT: s_lshr_b32 s73, s6, 16 -; GFX9-NEXT: s_lshr_b32 s74, s29, 16 -; GFX9-NEXT: s_lshr_b32 s75, s28, 16 -; GFX9-NEXT: s_lshr_b32 s76, s27, 16 -; GFX9-NEXT: s_lshr_b32 s77, s26, 16 -; GFX9-NEXT: s_lshr_b32 s78, s25, 16 -; GFX9-NEXT: s_lshr_b32 s79, s24, 16 -; GFX9-NEXT: s_lshr_b32 s88, s23, 16 -; GFX9-NEXT: s_lshr_b32 s89, s22, 16 -; GFX9-NEXT: s_lshr_b32 s90, s21, 16 -; GFX9-NEXT: s_lshr_b32 s91, s20, 16 -; GFX9-NEXT: s_lshr_b32 s92, s19, 16 -; GFX9-NEXT: s_lshr_b32 s93, s18, 16 -; GFX9-NEXT: s_lshr_b32 s94, s17, 16 -; GFX9-NEXT: s_lshr_b32 s95, s16, 16 +; GFX9-NEXT: s_lshr_b32 s56, s29, 16 +; GFX9-NEXT: s_lshr_b32 s57, s28, 16 +; GFX9-NEXT: s_lshr_b32 s58, s27, 16 +; GFX9-NEXT: s_lshr_b32 s59, s26, 16 +; GFX9-NEXT: s_lshr_b32 s60, s25, 16 +; GFX9-NEXT: s_lshr_b32 s61, s24, 16 +; GFX9-NEXT: s_lshr_b32 s62, s23, 16 +; GFX9-NEXT: s_lshr_b32 s63, s22, 16 +; GFX9-NEXT: s_lshr_b32 s72, s21, 16 +; GFX9-NEXT: s_lshr_b32 s73, s20, 16 +; GFX9-NEXT: s_lshr_b32 s74, s19, 16 +; GFX9-NEXT: s_lshr_b32 s75, s18, 16 +; GFX9-NEXT: s_lshr_b32 s76, s17, 16 +; GFX9-NEXT: s_lshr_b32 s77, s16, 16 +; GFX9-NEXT: s_lshr_b32 s78, s15, 16 +; GFX9-NEXT: s_lshr_b32 s79, s14, 16 +; GFX9-NEXT: s_lshr_b32 s88, s13, 16 +; GFX9-NEXT: s_lshr_b32 s89, s12, 16 +; GFX9-NEXT: s_lshr_b32 s90, s11, 16 +; GFX9-NEXT: s_lshr_b32 s91, s10, 16 +; GFX9-NEXT: s_lshr_b32 s92, s9, 16 +; GFX9-NEXT: s_lshr_b32 s93, s8, 16 +; GFX9-NEXT: s_lshr_b32 s94, s7, 16 +; GFX9-NEXT: s_lshr_b32 s95, s6, 16 ; GFX9-NEXT: s_cbranch_execnz .LBB41_3 ; GFX9-NEXT: .LBB41_2: ; %cmp.true ; GFX9-NEXT: s_add_u32 s42, s42, 3 ; GFX9-NEXT: s_addc_u32 s43, s43, 0 ; GFX9-NEXT: s_add_u32 s40, s40, 3 ; GFX9-NEXT: s_addc_u32 s41, s41, 0 -; GFX9-NEXT: s_add_u32 s14, s14, 3 -; GFX9-NEXT: s_addc_u32 s15, s15, 0 -; GFX9-NEXT: s_add_u32 s12, s12, 3 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-NEXT: s_add_u32 s10, s10, 3 -; GFX9-NEXT: s_addc_u32 s11, s11, 0 -; GFX9-NEXT: s_add_u32 s8, s8, 3 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-NEXT: s_add_u32 s6, s6, 3 -; GFX9-NEXT: s_addc_u32 s7, s7, 0 ; GFX9-NEXT: s_add_u32 s28, s28, 3 ; GFX9-NEXT: s_addc_u32 s29, s29, 0 ; GFX9-NEXT: s_add_u32 s26, s26, 3 @@ -26756,87 +27180,97 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3 ; GFX9-NEXT: s_addc_u32 s19, s19, 0 ; GFX9-NEXT: s_add_u32 s16, s16, 3 ; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_add_u32 s14, s14, 3 +; GFX9-NEXT: s_addc_u32 s15, s15, 0 +; GFX9-NEXT: s_add_u32 s12, s12, 3 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_add_u32 s10, s10, 3 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 ; GFX9-NEXT: s_lshr_b32 s44, s43, 16 ; GFX9-NEXT: s_lshr_b32 s45, s42, 16 ; GFX9-NEXT: s_lshr_b32 s46, s41, 16 ; GFX9-NEXT: s_lshr_b32 s47, s40, 16 -; GFX9-NEXT: s_lshr_b32 s56, s15, 16 -; GFX9-NEXT: s_lshr_b32 s57, s14, 16 -; GFX9-NEXT: s_lshr_b32 s58, s13, 16 -; GFX9-NEXT: s_lshr_b32 s59, s12, 16 -; GFX9-NEXT: s_lshr_b32 s60, s11, 16 -; GFX9-NEXT: s_lshr_b32 s61, s10, 16 -; GFX9-NEXT: s_lshr_b32 s62, s9, 16 -; GFX9-NEXT: s_lshr_b32 s63, s8, 16 -; GFX9-NEXT: s_lshr_b32 s72, s7, 16 -; GFX9-NEXT: s_lshr_b32 s73, s6, 16 -; GFX9-NEXT: s_lshr_b32 s74, s29, 16 -; GFX9-NEXT: s_lshr_b32 s75, s28, 16 -; GFX9-NEXT: s_lshr_b32 s76, s27, 16 -; GFX9-NEXT: s_lshr_b32 s77, s26, 16 -; GFX9-NEXT: s_lshr_b32 s78, s25, 16 -; GFX9-NEXT: s_lshr_b32 s79, s24, 16 -; GFX9-NEXT: s_lshr_b32 s88, s23, 16 -; GFX9-NEXT: s_lshr_b32 s89, s22, 16 -; GFX9-NEXT: s_lshr_b32 s90, s21, 16 -; GFX9-NEXT: s_lshr_b32 s91, s20, 16 -; GFX9-NEXT: s_lshr_b32 s92, s19, 16 -; GFX9-NEXT: s_lshr_b32 s93, s18, 16 -; GFX9-NEXT: s_lshr_b32 s94, s17, 16 -; GFX9-NEXT: s_lshr_b32 s95, s16, 16 +; GFX9-NEXT: s_lshr_b32 s56, s29, 16 +; GFX9-NEXT: s_lshr_b32 s57, s28, 16 +; GFX9-NEXT: s_lshr_b32 s58, s27, 16 +; GFX9-NEXT: s_lshr_b32 s59, s26, 16 +; GFX9-NEXT: s_lshr_b32 s60, s25, 16 +; GFX9-NEXT: s_lshr_b32 s61, s24, 16 +; GFX9-NEXT: s_lshr_b32 s62, s23, 16 +; GFX9-NEXT: s_lshr_b32 s63, s22, 16 +; GFX9-NEXT: s_lshr_b32 s72, s21, 16 +; GFX9-NEXT: s_lshr_b32 s73, s20, 16 +; GFX9-NEXT: s_lshr_b32 s74, s19, 16 +; GFX9-NEXT: s_lshr_b32 s75, s18, 16 +; GFX9-NEXT: s_lshr_b32 s76, s17, 16 +; GFX9-NEXT: s_lshr_b32 s77, s16, 16 +; GFX9-NEXT: s_lshr_b32 s78, s15, 16 +; GFX9-NEXT: s_lshr_b32 s79, s14, 16 +; GFX9-NEXT: s_lshr_b32 s88, s13, 16 +; GFX9-NEXT: s_lshr_b32 s89, s12, 16 +; GFX9-NEXT: s_lshr_b32 s90, s11, 16 +; GFX9-NEXT: s_lshr_b32 s91, s10, 16 +; GFX9-NEXT: s_lshr_b32 s92, s9, 16 +; GFX9-NEXT: s_lshr_b32 s93, s8, 16 +; GFX9-NEXT: s_lshr_b32 s94, s7, 16 +; GFX9-NEXT: s_lshr_b32 s95, s6, 16 ; GFX9-NEXT: .LBB41_3: ; %end -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s95 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s94 -; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s93 -; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s92 -; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s91 -; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s90 -; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s89 -; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s88 -; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s79 -; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s78 -; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s77 -; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s76 -; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s75 -; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s74 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s73 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s72 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s63 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s62 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s61 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s60 -; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s59 -; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s58 -; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s57 -; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s95 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s7, s94 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s93 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s9, s92 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s10, s91 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s11, s90 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s12, s89 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s13, s88 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s14, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s15, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s16, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s17, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s56 ; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s47 ; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s46 ; GFX9-NEXT: s_pack_ll_b32_b16 s40, s42, s45 ; GFX9-NEXT: s_pack_ll_b32_b16 s41, s43, s44 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s18 -; GFX9-NEXT: v_mov_b32_e32 v5, s19 -; GFX9-NEXT: v_mov_b32_e32 v6, s20 -; GFX9-NEXT: v_mov_b32_e32 v7, s21 -; GFX9-NEXT: v_mov_b32_e32 v8, s22 -; GFX9-NEXT: v_mov_b32_e32 v9, s23 -; GFX9-NEXT: v_mov_b32_e32 v10, s24 -; GFX9-NEXT: v_mov_b32_e32 v11, s25 -; GFX9-NEXT: v_mov_b32_e32 v12, s26 -; GFX9-NEXT: v_mov_b32_e32 v13, s27 -; GFX9-NEXT: v_mov_b32_e32 v14, s6 -; GFX9-NEXT: v_mov_b32_e32 v15, s7 -; GFX9-NEXT: v_mov_b32_e32 v16, s8 -; GFX9-NEXT: v_mov_b32_e32 v17, s9 -; GFX9-NEXT: v_mov_b32_e32 v18, s10 -; GFX9-NEXT: v_mov_b32_e32 v19, s11 -; GFX9-NEXT: v_mov_b32_e32 v20, s12 -; GFX9-NEXT: v_mov_b32_e32 v21, s13 -; GFX9-NEXT: v_mov_b32_e32 v22, s14 -; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-NEXT: v_mov_b32_e32 v12, s16 +; GFX9-NEXT: v_mov_b32_e32 v13, s17 +; GFX9-NEXT: v_mov_b32_e32 v14, s18 +; GFX9-NEXT: v_mov_b32_e32 v15, s19 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v18, s22 +; GFX9-NEXT: v_mov_b32_e32 v19, s23 +; GFX9-NEXT: v_mov_b32_e32 v20, s24 +; GFX9-NEXT: v_mov_b32_e32 v21, s25 +; GFX9-NEXT: v_mov_b32_e32 v22, s26 +; GFX9-NEXT: v_mov_b32_e32 v23, s27 ; GFX9-NEXT: v_mov_b32_e32 v24, s28 ; GFX9-NEXT: v_mov_b32_e32 v25, s29 ; GFX9-NEXT: v_mov_b32_e32 v26, s40 @@ -26876,45 +27310,72 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3 ; GFX11-LABEL: bitcast_v14i64_to_v56i16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v11, s0 :: v_dual_mov_b32 v12, s1 +; GFX11-NEXT: v_dual_mov_b32 v13, s2 :: v_dual_mov_b32 v14, s3 +; GFX11-NEXT: v_dual_mov_b32 v15, s16 :: v_dual_mov_b32 v16, s17 +; GFX11-NEXT: v_dual_mov_b32 v17, s18 :: v_dual_mov_b32 v18, s19 +; GFX11-NEXT: v_dual_mov_b32 v19, s20 :: v_dual_mov_b32 v20, s21 +; GFX11-NEXT: v_dual_mov_b32 v21, s22 :: v_dual_mov_b32 v22, s23 +; GFX11-NEXT: v_dual_mov_b32 v23, s24 :: v_dual_mov_b32 v24, s25 +; GFX11-NEXT: v_dual_mov_b32 v25, s26 :: v_dual_mov_b32 v26, s27 +; GFX11-NEXT: v_dual_mov_b32 v27, s28 :: v_dual_mov_b32 v28, s29 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: v_readfirstlane_b32 s8, v4 -; GFX11-NEXT: v_readfirstlane_b32 s9, v5 -; GFX11-NEXT: v_readfirstlane_b32 s10, v6 -; GFX11-NEXT: v_readfirstlane_b32 s11, v7 -; GFX11-NEXT: v_readfirstlane_b32 s13, v8 -; GFX11-NEXT: v_readfirstlane_b32 s12, v9 +; GFX11-NEXT: v_readfirstlane_b32 s0, v11 +; GFX11-NEXT: v_readfirstlane_b32 s1, v12 +; GFX11-NEXT: v_readfirstlane_b32 s2, v13 +; GFX11-NEXT: v_readfirstlane_b32 s3, v14 +; GFX11-NEXT: v_readfirstlane_b32 s4, v15 +; GFX11-NEXT: v_readfirstlane_b32 s5, v16 +; GFX11-NEXT: v_readfirstlane_b32 s6, v17 +; GFX11-NEXT: v_readfirstlane_b32 s7, v18 +; GFX11-NEXT: v_readfirstlane_b32 s8, v19 +; GFX11-NEXT: v_readfirstlane_b32 s9, v20 +; GFX11-NEXT: v_readfirstlane_b32 s10, v21 +; GFX11-NEXT: v_readfirstlane_b32 s11, v22 +; GFX11-NEXT: v_readfirstlane_b32 s12, v23 +; GFX11-NEXT: v_readfirstlane_b32 s13, v24 +; GFX11-NEXT: v_readfirstlane_b32 s14, v25 +; GFX11-NEXT: v_readfirstlane_b32 s15, v26 +; GFX11-NEXT: v_readfirstlane_b32 s16, v27 +; GFX11-NEXT: v_readfirstlane_b32 s17, v28 +; GFX11-NEXT: v_readfirstlane_b32 s18, v0 +; GFX11-NEXT: v_readfirstlane_b32 s19, v1 +; GFX11-NEXT: v_readfirstlane_b32 s20, v2 +; GFX11-NEXT: v_readfirstlane_b32 s21, v3 +; GFX11-NEXT: v_readfirstlane_b32 s22, v4 +; GFX11-NEXT: v_readfirstlane_b32 s23, v5 +; GFX11-NEXT: v_readfirstlane_b32 s24, v6 +; GFX11-NEXT: v_readfirstlane_b32 s25, v7 +; GFX11-NEXT: v_readfirstlane_b32 s27, v8 +; GFX11-NEXT: v_readfirstlane_b32 s26, v9 ; GFX11-NEXT: s_mov_b32 s90, 0 -; GFX11-NEXT: s_and_b32 s14, vcc_lo, exec_lo +; GFX11-NEXT: s_and_b32 s28, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB41_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: s_lshr_b32 s14, s12, 16 -; GFX11-NEXT: s_lshr_b32 s15, s13, 16 -; GFX11-NEXT: s_lshr_b32 s40, s11, 16 -; GFX11-NEXT: s_lshr_b32 s41, s10, 16 -; GFX11-NEXT: s_lshr_b32 s42, s9, 16 -; GFX11-NEXT: s_lshr_b32 s43, s8, 16 -; GFX11-NEXT: s_lshr_b32 s44, s7, 16 -; GFX11-NEXT: s_lshr_b32 s45, s6, 16 -; GFX11-NEXT: s_lshr_b32 s46, s5, 16 -; GFX11-NEXT: s_lshr_b32 s47, s4, 16 -; GFX11-NEXT: s_lshr_b32 s56, s29, 16 -; GFX11-NEXT: s_lshr_b32 s57, s28, 16 -; GFX11-NEXT: s_lshr_b32 s58, s27, 16 -; GFX11-NEXT: s_lshr_b32 s59, s26, 16 -; GFX11-NEXT: s_lshr_b32 s60, s25, 16 -; GFX11-NEXT: s_lshr_b32 s61, s24, 16 -; GFX11-NEXT: s_lshr_b32 s62, s23, 16 -; GFX11-NEXT: s_lshr_b32 s63, s22, 16 -; GFX11-NEXT: s_lshr_b32 s72, s21, 16 -; GFX11-NEXT: s_lshr_b32 s73, s20, 16 -; GFX11-NEXT: s_lshr_b32 s74, s19, 16 -; GFX11-NEXT: s_lshr_b32 s75, s18, 16 -; GFX11-NEXT: s_lshr_b32 s76, s17, 16 -; GFX11-NEXT: s_lshr_b32 s77, s16, 16 +; GFX11-NEXT: s_lshr_b32 s28, s26, 16 +; GFX11-NEXT: s_lshr_b32 s29, s27, 16 +; GFX11-NEXT: s_lshr_b32 s40, s25, 16 +; GFX11-NEXT: s_lshr_b32 s41, s24, 16 +; GFX11-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-NEXT: s_lshr_b32 s44, s21, 16 +; GFX11-NEXT: s_lshr_b32 s45, s20, 16 +; GFX11-NEXT: s_lshr_b32 s46, s19, 16 +; GFX11-NEXT: s_lshr_b32 s47, s18, 16 +; GFX11-NEXT: s_lshr_b32 s56, s17, 16 +; GFX11-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-NEXT: s_lshr_b32 s58, s15, 16 +; GFX11-NEXT: s_lshr_b32 s59, s14, 16 +; GFX11-NEXT: s_lshr_b32 s60, s13, 16 +; GFX11-NEXT: s_lshr_b32 s61, s12, 16 +; GFX11-NEXT: s_lshr_b32 s62, s11, 16 +; GFX11-NEXT: s_lshr_b32 s63, s10, 16 +; GFX11-NEXT: s_lshr_b32 s72, s9, 16 +; GFX11-NEXT: s_lshr_b32 s73, s8, 16 +; GFX11-NEXT: s_lshr_b32 s74, s7, 16 +; GFX11-NEXT: s_lshr_b32 s75, s6, 16 +; GFX11-NEXT: s_lshr_b32 s76, s5, 16 +; GFX11-NEXT: s_lshr_b32 s77, s4, 16 ; GFX11-NEXT: s_lshr_b32 s78, s3, 16 ; GFX11-NEXT: s_lshr_b32 s79, s2, 16 ; GFX11-NEXT: s_lshr_b32 s88, s1, 16 @@ -26922,20 +27383,8 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s90 ; GFX11-NEXT: s_cbranch_vccnz .LBB41_3 ; GFX11-NEXT: .LBB41_2: ; %cmp.true -; GFX11-NEXT: s_add_u32 s13, s13, 3 -; GFX11-NEXT: s_addc_u32 s12, s12, 0 -; GFX11-NEXT: s_add_u32 s10, s10, 3 -; GFX11-NEXT: s_addc_u32 s11, s11, 0 -; GFX11-NEXT: s_add_u32 s8, s8, 3 -; GFX11-NEXT: s_addc_u32 s9, s9, 0 -; GFX11-NEXT: s_add_u32 s6, s6, 3 -; GFX11-NEXT: s_addc_u32 s7, s7, 0 -; GFX11-NEXT: s_add_u32 s4, s4, 3 -; GFX11-NEXT: s_addc_u32 s5, s5, 0 -; GFX11-NEXT: s_add_u32 s28, s28, 3 -; GFX11-NEXT: s_addc_u32 s29, s29, 0 -; GFX11-NEXT: s_add_u32 s26, s26, 3 -; GFX11-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-NEXT: s_add_u32 s27, s27, 3 +; GFX11-NEXT: s_addc_u32 s26, s26, 0 ; GFX11-NEXT: s_add_u32 s24, s24, 3 ; GFX11-NEXT: s_addc_u32 s25, s25, 0 ; GFX11-NEXT: s_add_u32 s22, s22, 3 @@ -26946,34 +27395,46 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3 ; GFX11-NEXT: s_addc_u32 s19, s19, 0 ; GFX11-NEXT: s_add_u32 s16, s16, 3 ; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s14, s14, 3 +; GFX11-NEXT: s_addc_u32 s15, s15, 0 +; GFX11-NEXT: s_add_u32 s12, s12, 3 +; GFX11-NEXT: s_addc_u32 s13, s13, 0 +; GFX11-NEXT: s_add_u32 s10, s10, 3 +; GFX11-NEXT: s_addc_u32 s11, s11, 0 +; GFX11-NEXT: s_add_u32 s8, s8, 3 +; GFX11-NEXT: s_addc_u32 s9, s9, 0 +; GFX11-NEXT: s_add_u32 s6, s6, 3 +; GFX11-NEXT: s_addc_u32 s7, s7, 0 +; GFX11-NEXT: s_add_u32 s4, s4, 3 +; GFX11-NEXT: s_addc_u32 s5, s5, 0 ; GFX11-NEXT: s_add_u32 s2, s2, 3 ; GFX11-NEXT: s_addc_u32 s3, s3, 0 ; GFX11-NEXT: s_add_u32 s0, s0, 3 ; GFX11-NEXT: s_addc_u32 s1, s1, 0 -; GFX11-NEXT: s_lshr_b32 s14, s12, 16 -; GFX11-NEXT: s_lshr_b32 s15, s13, 16 -; GFX11-NEXT: s_lshr_b32 s40, s11, 16 -; GFX11-NEXT: s_lshr_b32 s41, s10, 16 -; GFX11-NEXT: s_lshr_b32 s42, s9, 16 -; GFX11-NEXT: s_lshr_b32 s43, s8, 16 -; GFX11-NEXT: s_lshr_b32 s44, s7, 16 -; GFX11-NEXT: s_lshr_b32 s45, s6, 16 -; GFX11-NEXT: s_lshr_b32 s46, s5, 16 -; GFX11-NEXT: s_lshr_b32 s47, s4, 16 -; GFX11-NEXT: s_lshr_b32 s56, s29, 16 -; GFX11-NEXT: s_lshr_b32 s57, s28, 16 -; GFX11-NEXT: s_lshr_b32 s58, s27, 16 -; GFX11-NEXT: s_lshr_b32 s59, s26, 16 -; GFX11-NEXT: s_lshr_b32 s60, s25, 16 -; GFX11-NEXT: s_lshr_b32 s61, s24, 16 -; GFX11-NEXT: s_lshr_b32 s62, s23, 16 -; GFX11-NEXT: s_lshr_b32 s63, s22, 16 -; GFX11-NEXT: s_lshr_b32 s72, s21, 16 -; GFX11-NEXT: s_lshr_b32 s73, s20, 16 -; GFX11-NEXT: s_lshr_b32 s74, s19, 16 -; GFX11-NEXT: s_lshr_b32 s75, s18, 16 -; GFX11-NEXT: s_lshr_b32 s76, s17, 16 -; GFX11-NEXT: s_lshr_b32 s77, s16, 16 +; GFX11-NEXT: s_lshr_b32 s28, s26, 16 +; GFX11-NEXT: s_lshr_b32 s29, s27, 16 +; GFX11-NEXT: s_lshr_b32 s40, s25, 16 +; GFX11-NEXT: s_lshr_b32 s41, s24, 16 +; GFX11-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-NEXT: s_lshr_b32 s44, s21, 16 +; GFX11-NEXT: s_lshr_b32 s45, s20, 16 +; GFX11-NEXT: s_lshr_b32 s46, s19, 16 +; GFX11-NEXT: s_lshr_b32 s47, s18, 16 +; GFX11-NEXT: s_lshr_b32 s56, s17, 16 +; GFX11-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-NEXT: s_lshr_b32 s58, s15, 16 +; GFX11-NEXT: s_lshr_b32 s59, s14, 16 +; GFX11-NEXT: s_lshr_b32 s60, s13, 16 +; GFX11-NEXT: s_lshr_b32 s61, s12, 16 +; GFX11-NEXT: s_lshr_b32 s62, s11, 16 +; GFX11-NEXT: s_lshr_b32 s63, s10, 16 +; GFX11-NEXT: s_lshr_b32 s72, s9, 16 +; GFX11-NEXT: s_lshr_b32 s73, s8, 16 +; GFX11-NEXT: s_lshr_b32 s74, s7, 16 +; GFX11-NEXT: s_lshr_b32 s75, s6, 16 +; GFX11-NEXT: s_lshr_b32 s76, s5, 16 +; GFX11-NEXT: s_lshr_b32 s77, s4, 16 ; GFX11-NEXT: s_lshr_b32 s78, s3, 16 ; GFX11-NEXT: s_lshr_b32 s79, s2, 16 ; GFX11-NEXT: s_lshr_b32 s88, s1, 16 @@ -26984,44 +27445,44 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3 ; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s88 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s79 ; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s78 -; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s77 -; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s76 -; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s75 -; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s74 -; GFX11-NEXT: s_pack_ll_b32_b16 s20, s20, s73 -; GFX11-NEXT: s_pack_ll_b32_b16 s21, s21, s72 -; GFX11-NEXT: s_pack_ll_b32_b16 s22, s22, s63 -; GFX11-NEXT: s_pack_ll_b32_b16 s23, s23, s62 -; GFX11-NEXT: s_pack_ll_b32_b16 s24, s24, s61 -; GFX11-NEXT: s_pack_ll_b32_b16 s25, s25, s60 -; GFX11-NEXT: s_pack_ll_b32_b16 s26, s26, s59 -; GFX11-NEXT: s_pack_ll_b32_b16 s27, s27, s58 -; GFX11-NEXT: s_pack_ll_b32_b16 s28, s28, s57 -; GFX11-NEXT: s_pack_ll_b32_b16 s29, s29, s56 -; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s47 -; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s46 -; GFX11-NEXT: s_pack_ll_b32_b16 s6, s6, s45 -; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s44 -; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s43 -; GFX11-NEXT: s_pack_ll_b32_b16 s9, s9, s42 -; GFX11-NEXT: s_pack_ll_b32_b16 s10, s10, s41 -; GFX11-NEXT: s_pack_ll_b32_b16 s11, s11, s40 -; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s15 -; GFX11-NEXT: s_pack_ll_b32_b16 s12, s12, s14 +; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s77 +; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s76 +; GFX11-NEXT: s_pack_ll_b32_b16 s6, s6, s75 +; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s74 +; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s73 +; GFX11-NEXT: s_pack_ll_b32_b16 s9, s9, s72 +; GFX11-NEXT: s_pack_ll_b32_b16 s10, s10, s63 +; GFX11-NEXT: s_pack_ll_b32_b16 s11, s11, s62 +; GFX11-NEXT: s_pack_ll_b32_b16 s12, s12, s61 +; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s60 +; GFX11-NEXT: s_pack_ll_b32_b16 s14, s14, s59 +; GFX11-NEXT: s_pack_ll_b32_b16 s15, s15, s58 +; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s57 +; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s56 +; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s47 +; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s46 +; GFX11-NEXT: s_pack_ll_b32_b16 s20, s20, s45 +; GFX11-NEXT: s_pack_ll_b32_b16 s21, s21, s44 +; GFX11-NEXT: s_pack_ll_b32_b16 s22, s22, s43 +; GFX11-NEXT: s_pack_ll_b32_b16 s23, s23, s42 +; GFX11-NEXT: s_pack_ll_b32_b16 s24, s24, s41 +; GFX11-NEXT: s_pack_ll_b32_b16 s25, s25, s40 +; GFX11-NEXT: s_pack_ll_b32_b16 s27, s27, s29 +; GFX11-NEXT: s_pack_ll_b32_b16 s26, s26, s28 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 -; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 -; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 -; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 -; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 -; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 -; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 -; GFX11-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 -; GFX11-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 -; GFX11-NEXT: v_dual_mov_b32 v22, s8 :: v_dual_mov_b32 v23, s9 -; GFX11-NEXT: v_dual_mov_b32 v24, s10 :: v_dual_mov_b32 v25, s11 -; GFX11-NEXT: v_dual_mov_b32 v26, s13 :: v_dual_mov_b32 v27, s12 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GFX11-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v21, s21 +; GFX11-NEXT: v_dual_mov_b32 v22, s22 :: v_dual_mov_b32 v23, s23 +; GFX11-NEXT: v_dual_mov_b32 v24, s24 :: v_dual_mov_b32 v25, s25 +; GFX11-NEXT: v_dual_mov_b32 v26, s27 :: v_dual_mov_b32 v27, s26 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB41_4: ; GFX11-NEXT: ; implicit-def: $sgpr89 @@ -27050,8 +27511,8 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr41 ; GFX11-NEXT: ; implicit-def: $sgpr40 -; GFX11-NEXT: ; implicit-def: $sgpr15 -; GFX11-NEXT: ; implicit-def: $sgpr14 +; GFX11-NEXT: ; implicit-def: $sgpr29 +; GFX11-NEXT: ; implicit-def: $sgpr28 ; GFX11-NEXT: s_branch .LBB41_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -27537,7 +27998,7 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 ; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 ; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 -; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v26 ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v27 ; SI-NEXT: .LBB42_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] @@ -30881,11 +31342,39 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i ; SI-LABEL: bitcast_v14i64_to_v56f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v16, s16 +; SI-NEXT: v_mov_b32_e32 v17, s17 +; SI-NEXT: v_mov_b32_e32 v18, s18 +; SI-NEXT: v_mov_b32_e32 v19, s19 +; SI-NEXT: v_readfirstlane_b32 s40, v16 +; SI-NEXT: v_mov_b32_e32 v16, s20 +; SI-NEXT: v_readfirstlane_b32 s42, v17 +; SI-NEXT: v_mov_b32_e32 v17, s21 +; SI-NEXT: v_readfirstlane_b32 s41, v18 +; SI-NEXT: v_mov_b32_e32 v18, s22 +; SI-NEXT: v_readfirstlane_b32 s43, v19 +; SI-NEXT: v_mov_b32_e32 v19, s23 +; SI-NEXT: v_readfirstlane_b32 s22, v16 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_readfirstlane_b32 s44, v17 +; SI-NEXT: v_mov_b32_e32 v17, s25 +; SI-NEXT: v_readfirstlane_b32 s23, v18 +; SI-NEXT: v_mov_b32_e32 v18, s26 +; SI-NEXT: v_readfirstlane_b32 s45, v19 +; SI-NEXT: v_mov_b32_e32 v19, s27 +; SI-NEXT: v_readfirstlane_b32 s24, v16 +; SI-NEXT: v_mov_b32_e32 v16, s28 +; SI-NEXT: v_readfirstlane_b32 s27, v17 +; SI-NEXT: v_mov_b32_e32 v17, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; SI-NEXT: v_readfirstlane_b32 s42, v1 -; SI-NEXT: v_readfirstlane_b32 s43, v2 -; SI-NEXT: v_readfirstlane_b32 s40, v3 -; SI-NEXT: v_readfirstlane_b32 s41, v4 +; SI-NEXT: v_readfirstlane_b32 s25, v18 +; SI-NEXT: v_readfirstlane_b32 s26, v19 +; SI-NEXT: v_readfirstlane_b32 s20, v16 +; SI-NEXT: v_readfirstlane_b32 s21, v17 +; SI-NEXT: v_readfirstlane_b32 s18, v1 +; SI-NEXT: v_readfirstlane_b32 s19, v2 +; SI-NEXT: v_readfirstlane_b32 s16, v3 +; SI-NEXT: v_readfirstlane_b32 s17, v4 ; SI-NEXT: v_readfirstlane_b32 s14, v5 ; SI-NEXT: v_readfirstlane_b32 s15, v6 ; SI-NEXT: v_readfirstlane_b32 s12, v7 @@ -30928,44 +31417,44 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 ; SI-NEXT: s_lshr_b32 s4, s14, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s41, 16 +; SI-NEXT: s_lshr_b32 s4, s17, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s40, 16 +; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_lshr_b32 s4, s43, 16 +; SI-NEXT: s_lshr_b32 s4, s19, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 -; SI-NEXT: s_lshr_b32 s4, s42, 16 +; SI-NEXT: s_lshr_b32 s4, s18, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 -; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s4, s21, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: s_lshr_b32 s4, s20, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 ; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 ; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 ; SI-NEXT: s_lshr_b32 s4, s24, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: s_lshr_b32 s4, s45, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: s_lshr_b32 s4, s23, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: s_lshr_b32 s4, s44, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: s_lshr_b32 s4, s22, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: s_lshr_b32 s4, s43, 16 ; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: s_lshr_b32 s4, s41, 16 ; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: s_lshr_b32 s4, s42, 16 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_lshr_b32 s4, s40, 16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 @@ -30978,62 +31467,62 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s27 ; SI-NEXT: v_cvt_f32_f16_e32 v48, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s40 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s16, 3 -; SI-NEXT: s_addc_u32 s5, s17, 0 -; SI-NEXT: s_lshr_b32 s16, s4, 16 -; SI-NEXT: s_lshr_b32 s17, s5, 16 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: s_lshr_b32 s44, s18, 16 -; SI-NEXT: s_lshr_b32 s45, s19, 16 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_lshr_b32 s46, s20, 16 -; SI-NEXT: s_lshr_b32 s47, s21, 16 +; SI-NEXT: s_add_u32 s4, s40, 3 +; SI-NEXT: s_addc_u32 s5, s42, 0 +; SI-NEXT: s_lshr_b32 s28, s4, 16 +; SI-NEXT: s_lshr_b32 s29, s5, 16 +; SI-NEXT: s_add_u32 s40, s41, 3 +; SI-NEXT: s_addc_u32 s41, s43, 0 +; SI-NEXT: s_lshr_b32 s42, s40, 16 +; SI-NEXT: s_lshr_b32 s43, s41, 16 ; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_lshr_b32 s56, s22, 16 -; SI-NEXT: s_lshr_b32 s57, s23, 16 +; SI-NEXT: s_addc_u32 s44, s44, 0 +; SI-NEXT: s_lshr_b32 s46, s22, 16 +; SI-NEXT: s_lshr_b32 s47, s44, 16 +; SI-NEXT: s_add_u32 s23, s23, 3 +; SI-NEXT: s_addc_u32 s45, s45, 0 +; SI-NEXT: s_lshr_b32 s56, s23, 16 +; SI-NEXT: s_lshr_b32 s57, s45, 16 ; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s25, s25, 0 -; SI-NEXT: s_lshr_b32 s58, s24, 16 -; SI-NEXT: s_lshr_b32 s59, s25, 16 -; SI-NEXT: s_add_u32 s26, s26, 3 ; SI-NEXT: s_addc_u32 s27, s27, 0 -; SI-NEXT: s_lshr_b32 s60, s26, 16 -; SI-NEXT: s_lshr_b32 s61, s27, 16 -; SI-NEXT: s_add_u32 s28, s28, 3 -; SI-NEXT: s_addc_u32 s29, s29, 0 -; SI-NEXT: s_lshr_b32 s62, s28, 16 -; SI-NEXT: s_lshr_b32 s63, s29, 16 -; SI-NEXT: s_add_u32 s42, s42, 3 -; SI-NEXT: s_addc_u32 s43, s43, 0 -; SI-NEXT: s_lshr_b32 s72, s42, 16 -; SI-NEXT: s_lshr_b32 s73, s43, 16 -; SI-NEXT: s_add_u32 s40, s40, 3 -; SI-NEXT: s_addc_u32 s41, s41, 0 -; SI-NEXT: s_lshr_b32 s74, s40, 16 -; SI-NEXT: s_lshr_b32 s75, s41, 16 +; SI-NEXT: s_lshr_b32 s58, s24, 16 +; SI-NEXT: s_lshr_b32 s59, s27, 16 +; SI-NEXT: s_add_u32 s25, s25, 3 +; SI-NEXT: s_addc_u32 s26, s26, 0 +; SI-NEXT: s_lshr_b32 s60, s25, 16 +; SI-NEXT: s_lshr_b32 s61, s26, 16 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_lshr_b32 s62, s20, 16 +; SI-NEXT: s_lshr_b32 s63, s21, 16 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_lshr_b32 s72, s18, 16 +; SI-NEXT: s_lshr_b32 s73, s19, 16 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_lshr_b32 s74, s16, 16 +; SI-NEXT: s_lshr_b32 s75, s17, 16 ; SI-NEXT: s_add_u32 s14, s14, 3 ; SI-NEXT: s_addc_u32 s15, s15, 0 ; SI-NEXT: s_lshr_b32 s76, s14, 16 @@ -31064,23 +31553,23 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s27 ; SI-NEXT: v_cvt_f32_f16_e32 v48, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s41 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v43, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s40 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v45, s5 ; SI-NEXT: s_waitcnt expcnt(1) @@ -31109,11 +31598,11 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v51, s56 ; SI-NEXT: v_cvt_f32_f16_e32 v53, s47 ; SI-NEXT: v_cvt_f32_f16_e32 v55, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s29 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v56, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s28 ; SI-NEXT: .LBB45_3: ; %end ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 @@ -31386,14 +31875,42 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v15, s16 +; VI-NEXT: v_mov_b32_e32 v16, s17 +; VI-NEXT: v_mov_b32_e32 v17, s18 +; VI-NEXT: v_mov_b32_e32 v18, s19 ; VI-NEXT: v_writelane_b32 v28, s30, 0 +; VI-NEXT: v_mov_b32_e32 v19, s20 +; VI-NEXT: v_readfirstlane_b32 s46, v15 +; VI-NEXT: v_mov_b32_e32 v15, s21 +; VI-NEXT: v_readfirstlane_b32 s45, v16 +; VI-NEXT: v_mov_b32_e32 v16, s22 +; VI-NEXT: v_readfirstlane_b32 s44, v17 +; VI-NEXT: v_mov_b32_e32 v17, s23 +; VI-NEXT: v_readfirstlane_b32 s43, v18 +; VI-NEXT: v_mov_b32_e32 v18, s24 ; VI-NEXT: v_writelane_b32 v28, s31, 1 +; VI-NEXT: v_readfirstlane_b32 s42, v19 +; VI-NEXT: v_mov_b32_e32 v19, s25 +; VI-NEXT: v_readfirstlane_b32 s41, v15 +; VI-NEXT: v_mov_b32_e32 v15, s26 +; VI-NEXT: v_readfirstlane_b32 s40, v16 +; VI-NEXT: v_mov_b32_e32 v16, s27 +; VI-NEXT: v_readfirstlane_b32 s26, v17 +; VI-NEXT: v_mov_b32_e32 v17, s28 +; VI-NEXT: v_readfirstlane_b32 s25, v18 +; VI-NEXT: v_mov_b32_e32 v18, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 ; VI-NEXT: v_writelane_b32 v28, s34, 2 -; VI-NEXT: v_readfirstlane_b32 s43, v0 -; VI-NEXT: v_readfirstlane_b32 s42, v1 -; VI-NEXT: v_readfirstlane_b32 s41, v2 -; VI-NEXT: v_readfirstlane_b32 s40, v3 +; VI-NEXT: v_readfirstlane_b32 s24, v19 +; VI-NEXT: v_readfirstlane_b32 s23, v15 +; VI-NEXT: v_readfirstlane_b32 s22, v16 +; VI-NEXT: v_readfirstlane_b32 s21, v17 +; VI-NEXT: v_readfirstlane_b32 s20, v18 +; VI-NEXT: v_readfirstlane_b32 s19, v0 +; VI-NEXT: v_readfirstlane_b32 s18, v1 +; VI-NEXT: v_readfirstlane_b32 s17, v2 +; VI-NEXT: v_readfirstlane_b32 s16, v3 ; VI-NEXT: v_readfirstlane_b32 s15, v4 ; VI-NEXT: v_readfirstlane_b32 s14, v5 ; VI-NEXT: v_readfirstlane_b32 s13, v6 @@ -31408,9 +31925,9 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i ; VI-NEXT: v_writelane_b32 v28, s35, 3 ; VI-NEXT: s_cbranch_scc0 .LBB45_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s44, s7, 16 -; VI-NEXT: s_lshr_b32 s45, s6, 16 -; VI-NEXT: s_lshr_b32 s46, s8, 16 +; VI-NEXT: s_lshr_b32 s27, s7, 16 +; VI-NEXT: s_lshr_b32 s28, s6, 16 +; VI-NEXT: s_lshr_b32 s29, s8, 16 ; VI-NEXT: s_lshr_b32 s47, s9, 16 ; VI-NEXT: s_lshr_b32 s56, s10, 16 ; VI-NEXT: s_lshr_b32 s57, s11, 16 @@ -31418,24 +31935,24 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i ; VI-NEXT: s_lshr_b32 s59, s13, 16 ; VI-NEXT: s_lshr_b32 s60, s14, 16 ; VI-NEXT: s_lshr_b32 s61, s15, 16 -; VI-NEXT: s_lshr_b32 s62, s40, 16 -; VI-NEXT: s_lshr_b32 s63, s41, 16 -; VI-NEXT: s_lshr_b32 s72, s42, 16 -; VI-NEXT: s_lshr_b32 s73, s43, 16 -; VI-NEXT: s_lshr_b32 s74, s29, 16 -; VI-NEXT: s_lshr_b32 s75, s28, 16 -; VI-NEXT: s_lshr_b32 s76, s27, 16 -; VI-NEXT: s_lshr_b32 s77, s26, 16 -; VI-NEXT: s_lshr_b32 s78, s25, 16 -; VI-NEXT: s_lshr_b32 s79, s24, 16 -; VI-NEXT: s_lshr_b32 s88, s23, 16 -; VI-NEXT: s_lshr_b32 s89, s22, 16 -; VI-NEXT: s_lshr_b32 s90, s21, 16 -; VI-NEXT: s_lshr_b32 s91, s20, 16 -; VI-NEXT: s_lshr_b32 s30, s19, 16 -; VI-NEXT: s_lshr_b32 s31, s18, 16 -; VI-NEXT: s_lshr_b32 s34, s17, 16 -; VI-NEXT: s_lshr_b32 s35, s16, 16 +; VI-NEXT: s_lshr_b32 s62, s16, 16 +; VI-NEXT: s_lshr_b32 s63, s17, 16 +; VI-NEXT: s_lshr_b32 s72, s18, 16 +; VI-NEXT: s_lshr_b32 s73, s19, 16 +; VI-NEXT: s_lshr_b32 s74, s20, 16 +; VI-NEXT: s_lshr_b32 s75, s21, 16 +; VI-NEXT: s_lshr_b32 s76, s22, 16 +; VI-NEXT: s_lshr_b32 s77, s23, 16 +; VI-NEXT: s_lshr_b32 s78, s24, 16 +; VI-NEXT: s_lshr_b32 s79, s25, 16 +; VI-NEXT: s_lshr_b32 s88, s26, 16 +; VI-NEXT: s_lshr_b32 s89, s40, 16 +; VI-NEXT: s_lshr_b32 s90, s41, 16 +; VI-NEXT: s_lshr_b32 s91, s42, 16 +; VI-NEXT: s_lshr_b32 s30, s43, 16 +; VI-NEXT: s_lshr_b32 s31, s44, 16 +; VI-NEXT: s_lshr_b32 s34, s45, 16 +; VI-NEXT: s_lshr_b32 s35, s46, 16 ; VI-NEXT: s_cbranch_execnz .LBB45_3 ; VI-NEXT: .LBB45_2: ; %cmp.true ; VI-NEXT: s_add_u32 s6, s6, 3 @@ -31448,27 +31965,27 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i ; VI-NEXT: s_addc_u32 s12, s12, 0 ; VI-NEXT: s_add_u32 s15, s15, 3 ; VI-NEXT: s_addc_u32 s14, s14, 0 -; VI-NEXT: s_add_u32 s41, s41, 3 -; VI-NEXT: s_addc_u32 s40, s40, 0 -; VI-NEXT: s_add_u32 s43, s43, 3 -; VI-NEXT: s_addc_u32 s42, s42, 0 -; VI-NEXT: s_add_u32 s28, s28, 3 -; VI-NEXT: s_addc_u32 s29, s29, 0 -; VI-NEXT: s_add_u32 s26, s26, 3 -; VI-NEXT: s_addc_u32 s27, s27, 0 -; VI-NEXT: s_add_u32 s24, s24, 3 -; VI-NEXT: s_addc_u32 s25, s25, 0 -; VI-NEXT: s_add_u32 s22, s22, 3 -; VI-NEXT: s_addc_u32 s23, s23, 0 -; VI-NEXT: s_add_u32 s20, s20, 3 -; VI-NEXT: s_addc_u32 s21, s21, 0 -; VI-NEXT: s_add_u32 s18, s18, 3 -; VI-NEXT: s_addc_u32 s19, s19, 0 -; VI-NEXT: s_add_u32 s16, s16, 3 -; VI-NEXT: s_addc_u32 s17, s17, 0 -; VI-NEXT: s_lshr_b32 s44, s7, 16 -; VI-NEXT: s_lshr_b32 s45, s6, 16 -; VI-NEXT: s_lshr_b32 s46, s8, 16 +; VI-NEXT: s_add_u32 s17, s17, 3 +; VI-NEXT: s_addc_u32 s16, s16, 0 +; VI-NEXT: s_add_u32 s19, s19, 3 +; VI-NEXT: s_addc_u32 s18, s18, 0 +; VI-NEXT: s_add_u32 s21, s21, 3 +; VI-NEXT: s_addc_u32 s20, s20, 0 +; VI-NEXT: s_add_u32 s23, s23, 3 +; VI-NEXT: s_addc_u32 s22, s22, 0 +; VI-NEXT: s_add_u32 s25, s25, 3 +; VI-NEXT: s_addc_u32 s24, s24, 0 +; VI-NEXT: s_add_u32 s40, s40, 3 +; VI-NEXT: s_addc_u32 s26, s26, 0 +; VI-NEXT: s_add_u32 s42, s42, 3 +; VI-NEXT: s_addc_u32 s41, s41, 0 +; VI-NEXT: s_add_u32 s44, s44, 3 +; VI-NEXT: s_addc_u32 s43, s43, 0 +; VI-NEXT: s_add_u32 s46, s46, 3 +; VI-NEXT: s_addc_u32 s45, s45, 0 +; VI-NEXT: s_lshr_b32 s27, s7, 16 +; VI-NEXT: s_lshr_b32 s28, s6, 16 +; VI-NEXT: s_lshr_b32 s29, s8, 16 ; VI-NEXT: s_lshr_b32 s47, s9, 16 ; VI-NEXT: s_lshr_b32 s56, s10, 16 ; VI-NEXT: s_lshr_b32 s57, s11, 16 @@ -31476,127 +31993,127 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i ; VI-NEXT: s_lshr_b32 s59, s13, 16 ; VI-NEXT: s_lshr_b32 s60, s14, 16 ; VI-NEXT: s_lshr_b32 s61, s15, 16 -; VI-NEXT: s_lshr_b32 s62, s40, 16 -; VI-NEXT: s_lshr_b32 s63, s41, 16 -; VI-NEXT: s_lshr_b32 s72, s42, 16 -; VI-NEXT: s_lshr_b32 s73, s43, 16 -; VI-NEXT: s_lshr_b32 s74, s29, 16 -; VI-NEXT: s_lshr_b32 s75, s28, 16 -; VI-NEXT: s_lshr_b32 s76, s27, 16 -; VI-NEXT: s_lshr_b32 s77, s26, 16 -; VI-NEXT: s_lshr_b32 s78, s25, 16 -; VI-NEXT: s_lshr_b32 s79, s24, 16 -; VI-NEXT: s_lshr_b32 s88, s23, 16 -; VI-NEXT: s_lshr_b32 s89, s22, 16 -; VI-NEXT: s_lshr_b32 s90, s21, 16 -; VI-NEXT: s_lshr_b32 s91, s20, 16 -; VI-NEXT: s_lshr_b32 s30, s19, 16 -; VI-NEXT: s_lshr_b32 s31, s18, 16 -; VI-NEXT: s_lshr_b32 s34, s17, 16 -; VI-NEXT: s_lshr_b32 s35, s16, 16 +; VI-NEXT: s_lshr_b32 s62, s16, 16 +; VI-NEXT: s_lshr_b32 s63, s17, 16 +; VI-NEXT: s_lshr_b32 s72, s18, 16 +; VI-NEXT: s_lshr_b32 s73, s19, 16 +; VI-NEXT: s_lshr_b32 s74, s20, 16 +; VI-NEXT: s_lshr_b32 s75, s21, 16 +; VI-NEXT: s_lshr_b32 s76, s22, 16 +; VI-NEXT: s_lshr_b32 s77, s23, 16 +; VI-NEXT: s_lshr_b32 s78, s24, 16 +; VI-NEXT: s_lshr_b32 s79, s25, 16 +; VI-NEXT: s_lshr_b32 s88, s26, 16 +; VI-NEXT: s_lshr_b32 s89, s40, 16 +; VI-NEXT: s_lshr_b32 s90, s41, 16 +; VI-NEXT: s_lshr_b32 s91, s42, 16 +; VI-NEXT: s_lshr_b32 s30, s43, 16 +; VI-NEXT: s_lshr_b32 s31, s44, 16 +; VI-NEXT: s_lshr_b32 s34, s45, 16 +; VI-NEXT: s_lshr_b32 s35, s46, 16 ; VI-NEXT: .LBB45_3: ; %end -; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_and_b32 s4, 0xffff, s46 ; VI-NEXT: s_lshl_b32 s5, s35, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s16, s34, 16 -; VI-NEXT: s_or_b32 s5, s5, s16 -; VI-NEXT: s_and_b32 s16, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s17, s31, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_and_b32 s17, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s18, s30, 16 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s18, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s19, s91, 16 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_and_b32 s19, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s20, s90, 16 -; VI-NEXT: s_or_b32 s19, s19, s20 -; VI-NEXT: s_and_b32 s20, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s21, s89, 16 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_and_b32 s21, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s22, s88, 16 -; VI-NEXT: s_or_b32 s21, s21, s22 -; VI-NEXT: s_and_b32 s22, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s23, s79, 16 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_and_b32 s23, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s24, s78, 16 -; VI-NEXT: s_or_b32 s23, s23, s24 -; VI-NEXT: s_and_b32 s24, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s25, s77, 16 -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_and_b32 s25, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s26, s76, 16 -; VI-NEXT: s_or_b32 s25, s25, s26 -; VI-NEXT: s_and_b32 s26, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s27, s75, 16 -; VI-NEXT: s_or_b32 s26, s26, s27 -; VI-NEXT: s_and_b32 s27, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s28, s74, 16 -; VI-NEXT: s_or_b32 s27, s27, s28 -; VI-NEXT: s_and_b32 s28, 0xffff, s43 -; VI-NEXT: s_lshl_b32 s29, s73, 16 -; VI-NEXT: s_or_b32 s28, s28, s29 -; VI-NEXT: s_and_b32 s29, 0xffff, s42 -; VI-NEXT: s_lshl_b32 s42, s72, 16 -; VI-NEXT: s_or_b32 s29, s29, s42 +; VI-NEXT: s_and_b32 s5, 0xffff, s45 +; VI-NEXT: s_lshl_b32 s45, s34, 16 +; VI-NEXT: s_or_b32 s5, s5, s45 +; VI-NEXT: s_and_b32 s44, 0xffff, s44 +; VI-NEXT: s_lshl_b32 s45, s31, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s43, 0xffff, s43 +; VI-NEXT: s_lshl_b32 s45, s30, 16 +; VI-NEXT: s_or_b32 s43, s43, s45 +; VI-NEXT: s_and_b32 s42, 0xffff, s42 +; VI-NEXT: s_lshl_b32 s45, s91, 16 +; VI-NEXT: s_or_b32 s42, s42, s45 ; VI-NEXT: s_and_b32 s41, 0xffff, s41 -; VI-NEXT: s_lshl_b32 s42, s63, 16 -; VI-NEXT: s_or_b32 s41, s41, s42 +; VI-NEXT: s_lshl_b32 s45, s90, 16 +; VI-NEXT: s_or_b32 s41, s41, s45 ; VI-NEXT: s_and_b32 s40, 0xffff, s40 -; VI-NEXT: s_lshl_b32 s42, s62, 16 -; VI-NEXT: s_or_b32 s40, s40, s42 +; VI-NEXT: s_lshl_b32 s45, s89, 16 +; VI-NEXT: s_or_b32 s40, s40, s45 +; VI-NEXT: s_and_b32 s26, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s45, s88, 16 +; VI-NEXT: s_or_b32 s26, s26, s45 +; VI-NEXT: s_and_b32 s25, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s45, s79, 16 +; VI-NEXT: s_or_b32 s25, s25, s45 +; VI-NEXT: s_and_b32 s24, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s45, s78, 16 +; VI-NEXT: s_or_b32 s24, s24, s45 +; VI-NEXT: s_and_b32 s23, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s45, s77, 16 +; VI-NEXT: s_or_b32 s23, s23, s45 +; VI-NEXT: s_and_b32 s22, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s45, s76, 16 +; VI-NEXT: s_or_b32 s22, s22, s45 +; VI-NEXT: s_and_b32 s21, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s45, s75, 16 +; VI-NEXT: s_or_b32 s21, s21, s45 +; VI-NEXT: s_and_b32 s20, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s45, s74, 16 +; VI-NEXT: s_or_b32 s20, s20, s45 +; VI-NEXT: s_and_b32 s19, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s45, s73, 16 +; VI-NEXT: s_or_b32 s19, s19, s45 +; VI-NEXT: s_and_b32 s18, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s72, 16 +; VI-NEXT: s_or_b32 s18, s18, s45 +; VI-NEXT: s_and_b32 s17, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s45, s63, 16 +; VI-NEXT: s_or_b32 s17, s17, s45 +; VI-NEXT: s_and_b32 s16, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s45, s62, 16 +; VI-NEXT: s_or_b32 s16, s16, s45 ; VI-NEXT: s_and_b32 s15, 0xffff, s15 -; VI-NEXT: s_lshl_b32 s42, s61, 16 -; VI-NEXT: s_or_b32 s15, s15, s42 +; VI-NEXT: s_lshl_b32 s45, s61, 16 +; VI-NEXT: s_or_b32 s15, s15, s45 ; VI-NEXT: s_and_b32 s14, 0xffff, s14 -; VI-NEXT: s_lshl_b32 s42, s60, 16 -; VI-NEXT: s_or_b32 s14, s14, s42 +; VI-NEXT: s_lshl_b32 s45, s60, 16 +; VI-NEXT: s_or_b32 s14, s14, s45 ; VI-NEXT: s_and_b32 s13, 0xffff, s13 -; VI-NEXT: s_lshl_b32 s42, s59, 16 -; VI-NEXT: s_or_b32 s13, s13, s42 +; VI-NEXT: s_lshl_b32 s45, s59, 16 +; VI-NEXT: s_or_b32 s13, s13, s45 ; VI-NEXT: s_and_b32 s12, 0xffff, s12 -; VI-NEXT: s_lshl_b32 s42, s58, 16 -; VI-NEXT: s_or_b32 s12, s12, s42 +; VI-NEXT: s_lshl_b32 s45, s58, 16 +; VI-NEXT: s_or_b32 s12, s12, s45 ; VI-NEXT: s_and_b32 s11, 0xffff, s11 -; VI-NEXT: s_lshl_b32 s42, s57, 16 -; VI-NEXT: s_or_b32 s11, s11, s42 +; VI-NEXT: s_lshl_b32 s45, s57, 16 +; VI-NEXT: s_or_b32 s11, s11, s45 ; VI-NEXT: s_and_b32 s10, 0xffff, s10 -; VI-NEXT: s_lshl_b32 s42, s56, 16 -; VI-NEXT: s_or_b32 s10, s10, s42 +; VI-NEXT: s_lshl_b32 s45, s56, 16 +; VI-NEXT: s_or_b32 s10, s10, s45 ; VI-NEXT: s_and_b32 s9, 0xffff, s9 -; VI-NEXT: s_lshl_b32 s42, s47, 16 -; VI-NEXT: s_or_b32 s9, s9, s42 +; VI-NEXT: s_lshl_b32 s45, s47, 16 ; VI-NEXT: s_and_b32 s8, 0xffff, s8 -; VI-NEXT: s_lshl_b32 s42, s46, 16 -; VI-NEXT: s_or_b32 s8, s8, s42 +; VI-NEXT: s_lshl_b32 s29, s29, 16 ; VI-NEXT: s_and_b32 s6, 0xffff, s6 -; VI-NEXT: s_lshl_b32 s42, s45, 16 -; VI-NEXT: s_or_b32 s6, s6, s42 +; VI-NEXT: s_lshl_b32 s28, s28, 16 ; VI-NEXT: s_and_b32 s7, 0xffff, s7 -; VI-NEXT: s_lshl_b32 s42, s44, 16 -; VI-NEXT: s_or_b32 s7, s7, s42 +; VI-NEXT: s_lshl_b32 s27, s27, 16 +; VI-NEXT: s_or_b32 s9, s9, s45 +; VI-NEXT: s_or_b32 s8, s8, s29 +; VI-NEXT: s_or_b32 s6, s6, s28 +; VI-NEXT: s_or_b32 s7, s7, s27 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: v_mov_b32_e32 v3, s17 -; VI-NEXT: v_mov_b32_e32 v4, s18 -; VI-NEXT: v_mov_b32_e32 v5, s19 -; VI-NEXT: v_mov_b32_e32 v6, s20 -; VI-NEXT: v_mov_b32_e32 v7, s21 -; VI-NEXT: v_mov_b32_e32 v8, s22 -; VI-NEXT: v_mov_b32_e32 v9, s23 -; VI-NEXT: v_mov_b32_e32 v10, s24 -; VI-NEXT: v_mov_b32_e32 v11, s25 -; VI-NEXT: v_mov_b32_e32 v12, s26 -; VI-NEXT: v_mov_b32_e32 v13, s27 -; VI-NEXT: v_mov_b32_e32 v14, s28 -; VI-NEXT: v_mov_b32_e32 v15, s29 -; VI-NEXT: v_mov_b32_e32 v16, s41 -; VI-NEXT: v_mov_b32_e32 v17, s40 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s43 +; VI-NEXT: v_mov_b32_e32 v4, s42 +; VI-NEXT: v_mov_b32_e32 v5, s41 +; VI-NEXT: v_mov_b32_e32 v6, s40 +; VI-NEXT: v_mov_b32_e32 v7, s26 +; VI-NEXT: v_mov_b32_e32 v8, s25 +; VI-NEXT: v_mov_b32_e32 v9, s24 +; VI-NEXT: v_mov_b32_e32 v10, s23 +; VI-NEXT: v_mov_b32_e32 v11, s22 +; VI-NEXT: v_mov_b32_e32 v12, s21 +; VI-NEXT: v_mov_b32_e32 v13, s20 +; VI-NEXT: v_mov_b32_e32 v14, s19 +; VI-NEXT: v_mov_b32_e32 v15, s18 +; VI-NEXT: v_mov_b32_e32 v16, s17 +; VI-NEXT: v_mov_b32_e32 v17, s16 ; VI-NEXT: v_mov_b32_e32 v18, s15 ; VI-NEXT: v_mov_b32_e32 v19, s14 ; VI-NEXT: v_mov_b32_e32 v20, s13 @@ -31642,25 +32159,53 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i ; VI-NEXT: ; implicit-def: $sgpr57 ; VI-NEXT: ; implicit-def: $sgpr56 ; VI-NEXT: ; implicit-def: $sgpr47 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr45 -; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 ; VI-NEXT: s_branch .LBB45_2 ; ; GFX9-LABEL: bitcast_v14i64_to_v56f16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v15, s16 +; GFX9-NEXT: v_mov_b32_e32 v16, s17 +; GFX9-NEXT: v_mov_b32_e32 v17, s18 +; GFX9-NEXT: v_mov_b32_e32 v18, s19 +; GFX9-NEXT: v_mov_b32_e32 v19, s20 +; GFX9-NEXT: v_readfirstlane_b32 s6, v15 +; GFX9-NEXT: v_mov_b32_e32 v15, s21 +; GFX9-NEXT: v_readfirstlane_b32 s7, v16 +; GFX9-NEXT: v_mov_b32_e32 v16, s22 +; GFX9-NEXT: v_readfirstlane_b32 s8, v17 +; GFX9-NEXT: v_mov_b32_e32 v17, s23 +; GFX9-NEXT: v_readfirstlane_b32 s9, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, s24 +; GFX9-NEXT: v_readfirstlane_b32 s10, v19 +; GFX9-NEXT: v_mov_b32_e32 v19, s25 +; GFX9-NEXT: v_readfirstlane_b32 s11, v15 +; GFX9-NEXT: v_mov_b32_e32 v15, s26 +; GFX9-NEXT: v_readfirstlane_b32 s12, v16 +; GFX9-NEXT: v_mov_b32_e32 v16, s27 +; GFX9-NEXT: v_readfirstlane_b32 s13, v17 +; GFX9-NEXT: v_mov_b32_e32 v17, s28 +; GFX9-NEXT: v_readfirstlane_b32 s14, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: v_readfirstlane_b32 s7, v1 -; GFX9-NEXT: v_readfirstlane_b32 s8, v2 -; GFX9-NEXT: v_readfirstlane_b32 s9, v3 -; GFX9-NEXT: v_readfirstlane_b32 s10, v4 -; GFX9-NEXT: v_readfirstlane_b32 s11, v5 -; GFX9-NEXT: v_readfirstlane_b32 s12, v6 -; GFX9-NEXT: v_readfirstlane_b32 s13, v7 -; GFX9-NEXT: v_readfirstlane_b32 s14, v8 -; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s15, v19 +; GFX9-NEXT: v_readfirstlane_b32 s16, v15 +; GFX9-NEXT: v_readfirstlane_b32 s17, v16 +; GFX9-NEXT: v_readfirstlane_b32 s18, v17 +; GFX9-NEXT: v_readfirstlane_b32 s19, v18 +; GFX9-NEXT: v_readfirstlane_b32 s20, v0 +; GFX9-NEXT: v_readfirstlane_b32 s21, v1 +; GFX9-NEXT: v_readfirstlane_b32 s22, v2 +; GFX9-NEXT: v_readfirstlane_b32 s23, v3 +; GFX9-NEXT: v_readfirstlane_b32 s24, v4 +; GFX9-NEXT: v_readfirstlane_b32 s25, v5 +; GFX9-NEXT: v_readfirstlane_b32 s26, v6 +; GFX9-NEXT: v_readfirstlane_b32 s27, v7 +; GFX9-NEXT: v_readfirstlane_b32 s28, v8 +; GFX9-NEXT: v_readfirstlane_b32 s29, v9 ; GFX9-NEXT: v_readfirstlane_b32 s40, v10 ; GFX9-NEXT: v_readfirstlane_b32 s41, v11 ; GFX9-NEXT: v_readfirstlane_b32 s42, v12 @@ -31672,46 +32217,36 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i ; GFX9-NEXT: s_lshr_b32 s45, s42, 16 ; GFX9-NEXT: s_lshr_b32 s46, s41, 16 ; GFX9-NEXT: s_lshr_b32 s47, s40, 16 -; GFX9-NEXT: s_lshr_b32 s56, s15, 16 -; GFX9-NEXT: s_lshr_b32 s57, s14, 16 -; GFX9-NEXT: s_lshr_b32 s58, s13, 16 -; GFX9-NEXT: s_lshr_b32 s59, s12, 16 -; GFX9-NEXT: s_lshr_b32 s60, s11, 16 -; GFX9-NEXT: s_lshr_b32 s61, s10, 16 -; GFX9-NEXT: s_lshr_b32 s62, s9, 16 -; GFX9-NEXT: s_lshr_b32 s63, s8, 16 -; GFX9-NEXT: s_lshr_b32 s72, s7, 16 -; GFX9-NEXT: s_lshr_b32 s73, s6, 16 -; GFX9-NEXT: s_lshr_b32 s74, s29, 16 -; GFX9-NEXT: s_lshr_b32 s75, s28, 16 -; GFX9-NEXT: s_lshr_b32 s76, s27, 16 -; GFX9-NEXT: s_lshr_b32 s77, s26, 16 -; GFX9-NEXT: s_lshr_b32 s78, s25, 16 -; GFX9-NEXT: s_lshr_b32 s79, s24, 16 -; GFX9-NEXT: s_lshr_b32 s88, s23, 16 -; GFX9-NEXT: s_lshr_b32 s89, s22, 16 -; GFX9-NEXT: s_lshr_b32 s90, s21, 16 -; GFX9-NEXT: s_lshr_b32 s91, s20, 16 -; GFX9-NEXT: s_lshr_b32 s92, s19, 16 -; GFX9-NEXT: s_lshr_b32 s93, s18, 16 -; GFX9-NEXT: s_lshr_b32 s94, s17, 16 -; GFX9-NEXT: s_lshr_b32 s95, s16, 16 +; GFX9-NEXT: s_lshr_b32 s56, s29, 16 +; GFX9-NEXT: s_lshr_b32 s57, s28, 16 +; GFX9-NEXT: s_lshr_b32 s58, s27, 16 +; GFX9-NEXT: s_lshr_b32 s59, s26, 16 +; GFX9-NEXT: s_lshr_b32 s60, s25, 16 +; GFX9-NEXT: s_lshr_b32 s61, s24, 16 +; GFX9-NEXT: s_lshr_b32 s62, s23, 16 +; GFX9-NEXT: s_lshr_b32 s63, s22, 16 +; GFX9-NEXT: s_lshr_b32 s72, s21, 16 +; GFX9-NEXT: s_lshr_b32 s73, s20, 16 +; GFX9-NEXT: s_lshr_b32 s74, s19, 16 +; GFX9-NEXT: s_lshr_b32 s75, s18, 16 +; GFX9-NEXT: s_lshr_b32 s76, s17, 16 +; GFX9-NEXT: s_lshr_b32 s77, s16, 16 +; GFX9-NEXT: s_lshr_b32 s78, s15, 16 +; GFX9-NEXT: s_lshr_b32 s79, s14, 16 +; GFX9-NEXT: s_lshr_b32 s88, s13, 16 +; GFX9-NEXT: s_lshr_b32 s89, s12, 16 +; GFX9-NEXT: s_lshr_b32 s90, s11, 16 +; GFX9-NEXT: s_lshr_b32 s91, s10, 16 +; GFX9-NEXT: s_lshr_b32 s92, s9, 16 +; GFX9-NEXT: s_lshr_b32 s93, s8, 16 +; GFX9-NEXT: s_lshr_b32 s94, s7, 16 +; GFX9-NEXT: s_lshr_b32 s95, s6, 16 ; GFX9-NEXT: s_cbranch_execnz .LBB45_3 ; GFX9-NEXT: .LBB45_2: ; %cmp.true ; GFX9-NEXT: s_add_u32 s42, s42, 3 ; GFX9-NEXT: s_addc_u32 s43, s43, 0 ; GFX9-NEXT: s_add_u32 s40, s40, 3 ; GFX9-NEXT: s_addc_u32 s41, s41, 0 -; GFX9-NEXT: s_add_u32 s14, s14, 3 -; GFX9-NEXT: s_addc_u32 s15, s15, 0 -; GFX9-NEXT: s_add_u32 s12, s12, 3 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-NEXT: s_add_u32 s10, s10, 3 -; GFX9-NEXT: s_addc_u32 s11, s11, 0 -; GFX9-NEXT: s_add_u32 s8, s8, 3 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-NEXT: s_add_u32 s6, s6, 3 -; GFX9-NEXT: s_addc_u32 s7, s7, 0 ; GFX9-NEXT: s_add_u32 s28, s28, 3 ; GFX9-NEXT: s_addc_u32 s29, s29, 0 ; GFX9-NEXT: s_add_u32 s26, s26, 3 @@ -31726,87 +32261,97 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i ; GFX9-NEXT: s_addc_u32 s19, s19, 0 ; GFX9-NEXT: s_add_u32 s16, s16, 3 ; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_add_u32 s14, s14, 3 +; GFX9-NEXT: s_addc_u32 s15, s15, 0 +; GFX9-NEXT: s_add_u32 s12, s12, 3 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_add_u32 s10, s10, 3 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 ; GFX9-NEXT: s_lshr_b32 s44, s43, 16 ; GFX9-NEXT: s_lshr_b32 s45, s42, 16 ; GFX9-NEXT: s_lshr_b32 s46, s41, 16 ; GFX9-NEXT: s_lshr_b32 s47, s40, 16 -; GFX9-NEXT: s_lshr_b32 s56, s15, 16 -; GFX9-NEXT: s_lshr_b32 s57, s14, 16 -; GFX9-NEXT: s_lshr_b32 s58, s13, 16 -; GFX9-NEXT: s_lshr_b32 s59, s12, 16 -; GFX9-NEXT: s_lshr_b32 s60, s11, 16 -; GFX9-NEXT: s_lshr_b32 s61, s10, 16 -; GFX9-NEXT: s_lshr_b32 s62, s9, 16 -; GFX9-NEXT: s_lshr_b32 s63, s8, 16 -; GFX9-NEXT: s_lshr_b32 s72, s7, 16 -; GFX9-NEXT: s_lshr_b32 s73, s6, 16 -; GFX9-NEXT: s_lshr_b32 s74, s29, 16 -; GFX9-NEXT: s_lshr_b32 s75, s28, 16 -; GFX9-NEXT: s_lshr_b32 s76, s27, 16 -; GFX9-NEXT: s_lshr_b32 s77, s26, 16 -; GFX9-NEXT: s_lshr_b32 s78, s25, 16 -; GFX9-NEXT: s_lshr_b32 s79, s24, 16 -; GFX9-NEXT: s_lshr_b32 s88, s23, 16 -; GFX9-NEXT: s_lshr_b32 s89, s22, 16 -; GFX9-NEXT: s_lshr_b32 s90, s21, 16 -; GFX9-NEXT: s_lshr_b32 s91, s20, 16 -; GFX9-NEXT: s_lshr_b32 s92, s19, 16 -; GFX9-NEXT: s_lshr_b32 s93, s18, 16 -; GFX9-NEXT: s_lshr_b32 s94, s17, 16 -; GFX9-NEXT: s_lshr_b32 s95, s16, 16 +; GFX9-NEXT: s_lshr_b32 s56, s29, 16 +; GFX9-NEXT: s_lshr_b32 s57, s28, 16 +; GFX9-NEXT: s_lshr_b32 s58, s27, 16 +; GFX9-NEXT: s_lshr_b32 s59, s26, 16 +; GFX9-NEXT: s_lshr_b32 s60, s25, 16 +; GFX9-NEXT: s_lshr_b32 s61, s24, 16 +; GFX9-NEXT: s_lshr_b32 s62, s23, 16 +; GFX9-NEXT: s_lshr_b32 s63, s22, 16 +; GFX9-NEXT: s_lshr_b32 s72, s21, 16 +; GFX9-NEXT: s_lshr_b32 s73, s20, 16 +; GFX9-NEXT: s_lshr_b32 s74, s19, 16 +; GFX9-NEXT: s_lshr_b32 s75, s18, 16 +; GFX9-NEXT: s_lshr_b32 s76, s17, 16 +; GFX9-NEXT: s_lshr_b32 s77, s16, 16 +; GFX9-NEXT: s_lshr_b32 s78, s15, 16 +; GFX9-NEXT: s_lshr_b32 s79, s14, 16 +; GFX9-NEXT: s_lshr_b32 s88, s13, 16 +; GFX9-NEXT: s_lshr_b32 s89, s12, 16 +; GFX9-NEXT: s_lshr_b32 s90, s11, 16 +; GFX9-NEXT: s_lshr_b32 s91, s10, 16 +; GFX9-NEXT: s_lshr_b32 s92, s9, 16 +; GFX9-NEXT: s_lshr_b32 s93, s8, 16 +; GFX9-NEXT: s_lshr_b32 s94, s7, 16 +; GFX9-NEXT: s_lshr_b32 s95, s6, 16 ; GFX9-NEXT: .LBB45_3: ; %end -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s95 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s94 -; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s93 -; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s92 -; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s91 -; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s90 -; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s89 -; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s88 -; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s79 -; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s78 -; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s77 -; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s76 -; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s75 -; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s74 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s73 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s72 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s63 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s62 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s61 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s60 -; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s59 -; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s58 -; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s57 -; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s95 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s7, s94 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s93 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s9, s92 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s10, s91 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s11, s90 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s12, s89 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s13, s88 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s14, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s15, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s16, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s17, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s56 ; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s47 ; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s46 ; GFX9-NEXT: s_pack_ll_b32_b16 s40, s42, s45 ; GFX9-NEXT: s_pack_ll_b32_b16 s41, s43, s44 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s18 -; GFX9-NEXT: v_mov_b32_e32 v5, s19 -; GFX9-NEXT: v_mov_b32_e32 v6, s20 -; GFX9-NEXT: v_mov_b32_e32 v7, s21 -; GFX9-NEXT: v_mov_b32_e32 v8, s22 -; GFX9-NEXT: v_mov_b32_e32 v9, s23 -; GFX9-NEXT: v_mov_b32_e32 v10, s24 -; GFX9-NEXT: v_mov_b32_e32 v11, s25 -; GFX9-NEXT: v_mov_b32_e32 v12, s26 -; GFX9-NEXT: v_mov_b32_e32 v13, s27 -; GFX9-NEXT: v_mov_b32_e32 v14, s6 -; GFX9-NEXT: v_mov_b32_e32 v15, s7 -; GFX9-NEXT: v_mov_b32_e32 v16, s8 -; GFX9-NEXT: v_mov_b32_e32 v17, s9 -; GFX9-NEXT: v_mov_b32_e32 v18, s10 -; GFX9-NEXT: v_mov_b32_e32 v19, s11 -; GFX9-NEXT: v_mov_b32_e32 v20, s12 -; GFX9-NEXT: v_mov_b32_e32 v21, s13 -; GFX9-NEXT: v_mov_b32_e32 v22, s14 -; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-NEXT: v_mov_b32_e32 v12, s16 +; GFX9-NEXT: v_mov_b32_e32 v13, s17 +; GFX9-NEXT: v_mov_b32_e32 v14, s18 +; GFX9-NEXT: v_mov_b32_e32 v15, s19 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v18, s22 +; GFX9-NEXT: v_mov_b32_e32 v19, s23 +; GFX9-NEXT: v_mov_b32_e32 v20, s24 +; GFX9-NEXT: v_mov_b32_e32 v21, s25 +; GFX9-NEXT: v_mov_b32_e32 v22, s26 +; GFX9-NEXT: v_mov_b32_e32 v23, s27 ; GFX9-NEXT: v_mov_b32_e32 v24, s28 ; GFX9-NEXT: v_mov_b32_e32 v25, s29 ; GFX9-NEXT: v_mov_b32_e32 v26, s40 @@ -31846,45 +32391,72 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i ; GFX11-LABEL: bitcast_v14i64_to_v56f16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v11, s0 :: v_dual_mov_b32 v12, s1 +; GFX11-NEXT: v_dual_mov_b32 v13, s2 :: v_dual_mov_b32 v14, s3 +; GFX11-NEXT: v_dual_mov_b32 v15, s16 :: v_dual_mov_b32 v16, s17 +; GFX11-NEXT: v_dual_mov_b32 v17, s18 :: v_dual_mov_b32 v18, s19 +; GFX11-NEXT: v_dual_mov_b32 v19, s20 :: v_dual_mov_b32 v20, s21 +; GFX11-NEXT: v_dual_mov_b32 v21, s22 :: v_dual_mov_b32 v22, s23 +; GFX11-NEXT: v_dual_mov_b32 v23, s24 :: v_dual_mov_b32 v24, s25 +; GFX11-NEXT: v_dual_mov_b32 v25, s26 :: v_dual_mov_b32 v26, s27 +; GFX11-NEXT: v_dual_mov_b32 v27, s28 :: v_dual_mov_b32 v28, s29 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: v_readfirstlane_b32 s8, v4 -; GFX11-NEXT: v_readfirstlane_b32 s9, v5 -; GFX11-NEXT: v_readfirstlane_b32 s10, v6 -; GFX11-NEXT: v_readfirstlane_b32 s11, v7 -; GFX11-NEXT: v_readfirstlane_b32 s13, v8 -; GFX11-NEXT: v_readfirstlane_b32 s12, v9 +; GFX11-NEXT: v_readfirstlane_b32 s0, v11 +; GFX11-NEXT: v_readfirstlane_b32 s1, v12 +; GFX11-NEXT: v_readfirstlane_b32 s2, v13 +; GFX11-NEXT: v_readfirstlane_b32 s3, v14 +; GFX11-NEXT: v_readfirstlane_b32 s4, v15 +; GFX11-NEXT: v_readfirstlane_b32 s5, v16 +; GFX11-NEXT: v_readfirstlane_b32 s6, v17 +; GFX11-NEXT: v_readfirstlane_b32 s7, v18 +; GFX11-NEXT: v_readfirstlane_b32 s8, v19 +; GFX11-NEXT: v_readfirstlane_b32 s9, v20 +; GFX11-NEXT: v_readfirstlane_b32 s10, v21 +; GFX11-NEXT: v_readfirstlane_b32 s11, v22 +; GFX11-NEXT: v_readfirstlane_b32 s12, v23 +; GFX11-NEXT: v_readfirstlane_b32 s13, v24 +; GFX11-NEXT: v_readfirstlane_b32 s14, v25 +; GFX11-NEXT: v_readfirstlane_b32 s15, v26 +; GFX11-NEXT: v_readfirstlane_b32 s16, v27 +; GFX11-NEXT: v_readfirstlane_b32 s17, v28 +; GFX11-NEXT: v_readfirstlane_b32 s18, v0 +; GFX11-NEXT: v_readfirstlane_b32 s19, v1 +; GFX11-NEXT: v_readfirstlane_b32 s20, v2 +; GFX11-NEXT: v_readfirstlane_b32 s21, v3 +; GFX11-NEXT: v_readfirstlane_b32 s22, v4 +; GFX11-NEXT: v_readfirstlane_b32 s23, v5 +; GFX11-NEXT: v_readfirstlane_b32 s24, v6 +; GFX11-NEXT: v_readfirstlane_b32 s25, v7 +; GFX11-NEXT: v_readfirstlane_b32 s27, v8 +; GFX11-NEXT: v_readfirstlane_b32 s26, v9 ; GFX11-NEXT: s_mov_b32 s90, 0 -; GFX11-NEXT: s_and_b32 s14, vcc_lo, exec_lo +; GFX11-NEXT: s_and_b32 s28, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB45_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: s_lshr_b32 s14, s12, 16 -; GFX11-NEXT: s_lshr_b32 s15, s13, 16 -; GFX11-NEXT: s_lshr_b32 s40, s11, 16 -; GFX11-NEXT: s_lshr_b32 s41, s10, 16 -; GFX11-NEXT: s_lshr_b32 s42, s9, 16 -; GFX11-NEXT: s_lshr_b32 s43, s8, 16 -; GFX11-NEXT: s_lshr_b32 s44, s7, 16 -; GFX11-NEXT: s_lshr_b32 s45, s6, 16 -; GFX11-NEXT: s_lshr_b32 s46, s5, 16 -; GFX11-NEXT: s_lshr_b32 s47, s4, 16 -; GFX11-NEXT: s_lshr_b32 s56, s29, 16 -; GFX11-NEXT: s_lshr_b32 s57, s28, 16 -; GFX11-NEXT: s_lshr_b32 s58, s27, 16 -; GFX11-NEXT: s_lshr_b32 s59, s26, 16 -; GFX11-NEXT: s_lshr_b32 s60, s25, 16 -; GFX11-NEXT: s_lshr_b32 s61, s24, 16 -; GFX11-NEXT: s_lshr_b32 s62, s23, 16 -; GFX11-NEXT: s_lshr_b32 s63, s22, 16 -; GFX11-NEXT: s_lshr_b32 s72, s21, 16 -; GFX11-NEXT: s_lshr_b32 s73, s20, 16 -; GFX11-NEXT: s_lshr_b32 s74, s19, 16 -; GFX11-NEXT: s_lshr_b32 s75, s18, 16 -; GFX11-NEXT: s_lshr_b32 s76, s17, 16 -; GFX11-NEXT: s_lshr_b32 s77, s16, 16 +; GFX11-NEXT: s_lshr_b32 s28, s26, 16 +; GFX11-NEXT: s_lshr_b32 s29, s27, 16 +; GFX11-NEXT: s_lshr_b32 s40, s25, 16 +; GFX11-NEXT: s_lshr_b32 s41, s24, 16 +; GFX11-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-NEXT: s_lshr_b32 s44, s21, 16 +; GFX11-NEXT: s_lshr_b32 s45, s20, 16 +; GFX11-NEXT: s_lshr_b32 s46, s19, 16 +; GFX11-NEXT: s_lshr_b32 s47, s18, 16 +; GFX11-NEXT: s_lshr_b32 s56, s17, 16 +; GFX11-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-NEXT: s_lshr_b32 s58, s15, 16 +; GFX11-NEXT: s_lshr_b32 s59, s14, 16 +; GFX11-NEXT: s_lshr_b32 s60, s13, 16 +; GFX11-NEXT: s_lshr_b32 s61, s12, 16 +; GFX11-NEXT: s_lshr_b32 s62, s11, 16 +; GFX11-NEXT: s_lshr_b32 s63, s10, 16 +; GFX11-NEXT: s_lshr_b32 s72, s9, 16 +; GFX11-NEXT: s_lshr_b32 s73, s8, 16 +; GFX11-NEXT: s_lshr_b32 s74, s7, 16 +; GFX11-NEXT: s_lshr_b32 s75, s6, 16 +; GFX11-NEXT: s_lshr_b32 s76, s5, 16 +; GFX11-NEXT: s_lshr_b32 s77, s4, 16 ; GFX11-NEXT: s_lshr_b32 s78, s3, 16 ; GFX11-NEXT: s_lshr_b32 s79, s2, 16 ; GFX11-NEXT: s_lshr_b32 s88, s1, 16 @@ -31892,20 +32464,8 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s90 ; GFX11-NEXT: s_cbranch_vccnz .LBB45_3 ; GFX11-NEXT: .LBB45_2: ; %cmp.true -; GFX11-NEXT: s_add_u32 s13, s13, 3 -; GFX11-NEXT: s_addc_u32 s12, s12, 0 -; GFX11-NEXT: s_add_u32 s10, s10, 3 -; GFX11-NEXT: s_addc_u32 s11, s11, 0 -; GFX11-NEXT: s_add_u32 s8, s8, 3 -; GFX11-NEXT: s_addc_u32 s9, s9, 0 -; GFX11-NEXT: s_add_u32 s6, s6, 3 -; GFX11-NEXT: s_addc_u32 s7, s7, 0 -; GFX11-NEXT: s_add_u32 s4, s4, 3 -; GFX11-NEXT: s_addc_u32 s5, s5, 0 -; GFX11-NEXT: s_add_u32 s28, s28, 3 -; GFX11-NEXT: s_addc_u32 s29, s29, 0 -; GFX11-NEXT: s_add_u32 s26, s26, 3 -; GFX11-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-NEXT: s_add_u32 s27, s27, 3 +; GFX11-NEXT: s_addc_u32 s26, s26, 0 ; GFX11-NEXT: s_add_u32 s24, s24, 3 ; GFX11-NEXT: s_addc_u32 s25, s25, 0 ; GFX11-NEXT: s_add_u32 s22, s22, 3 @@ -31916,34 +32476,46 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i ; GFX11-NEXT: s_addc_u32 s19, s19, 0 ; GFX11-NEXT: s_add_u32 s16, s16, 3 ; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s14, s14, 3 +; GFX11-NEXT: s_addc_u32 s15, s15, 0 +; GFX11-NEXT: s_add_u32 s12, s12, 3 +; GFX11-NEXT: s_addc_u32 s13, s13, 0 +; GFX11-NEXT: s_add_u32 s10, s10, 3 +; GFX11-NEXT: s_addc_u32 s11, s11, 0 +; GFX11-NEXT: s_add_u32 s8, s8, 3 +; GFX11-NEXT: s_addc_u32 s9, s9, 0 +; GFX11-NEXT: s_add_u32 s6, s6, 3 +; GFX11-NEXT: s_addc_u32 s7, s7, 0 +; GFX11-NEXT: s_add_u32 s4, s4, 3 +; GFX11-NEXT: s_addc_u32 s5, s5, 0 ; GFX11-NEXT: s_add_u32 s2, s2, 3 ; GFX11-NEXT: s_addc_u32 s3, s3, 0 ; GFX11-NEXT: s_add_u32 s0, s0, 3 ; GFX11-NEXT: s_addc_u32 s1, s1, 0 -; GFX11-NEXT: s_lshr_b32 s14, s12, 16 -; GFX11-NEXT: s_lshr_b32 s15, s13, 16 -; GFX11-NEXT: s_lshr_b32 s40, s11, 16 -; GFX11-NEXT: s_lshr_b32 s41, s10, 16 -; GFX11-NEXT: s_lshr_b32 s42, s9, 16 -; GFX11-NEXT: s_lshr_b32 s43, s8, 16 -; GFX11-NEXT: s_lshr_b32 s44, s7, 16 -; GFX11-NEXT: s_lshr_b32 s45, s6, 16 -; GFX11-NEXT: s_lshr_b32 s46, s5, 16 -; GFX11-NEXT: s_lshr_b32 s47, s4, 16 -; GFX11-NEXT: s_lshr_b32 s56, s29, 16 -; GFX11-NEXT: s_lshr_b32 s57, s28, 16 -; GFX11-NEXT: s_lshr_b32 s58, s27, 16 -; GFX11-NEXT: s_lshr_b32 s59, s26, 16 -; GFX11-NEXT: s_lshr_b32 s60, s25, 16 -; GFX11-NEXT: s_lshr_b32 s61, s24, 16 -; GFX11-NEXT: s_lshr_b32 s62, s23, 16 -; GFX11-NEXT: s_lshr_b32 s63, s22, 16 -; GFX11-NEXT: s_lshr_b32 s72, s21, 16 -; GFX11-NEXT: s_lshr_b32 s73, s20, 16 -; GFX11-NEXT: s_lshr_b32 s74, s19, 16 -; GFX11-NEXT: s_lshr_b32 s75, s18, 16 -; GFX11-NEXT: s_lshr_b32 s76, s17, 16 -; GFX11-NEXT: s_lshr_b32 s77, s16, 16 +; GFX11-NEXT: s_lshr_b32 s28, s26, 16 +; GFX11-NEXT: s_lshr_b32 s29, s27, 16 +; GFX11-NEXT: s_lshr_b32 s40, s25, 16 +; GFX11-NEXT: s_lshr_b32 s41, s24, 16 +; GFX11-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-NEXT: s_lshr_b32 s44, s21, 16 +; GFX11-NEXT: s_lshr_b32 s45, s20, 16 +; GFX11-NEXT: s_lshr_b32 s46, s19, 16 +; GFX11-NEXT: s_lshr_b32 s47, s18, 16 +; GFX11-NEXT: s_lshr_b32 s56, s17, 16 +; GFX11-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-NEXT: s_lshr_b32 s58, s15, 16 +; GFX11-NEXT: s_lshr_b32 s59, s14, 16 +; GFX11-NEXT: s_lshr_b32 s60, s13, 16 +; GFX11-NEXT: s_lshr_b32 s61, s12, 16 +; GFX11-NEXT: s_lshr_b32 s62, s11, 16 +; GFX11-NEXT: s_lshr_b32 s63, s10, 16 +; GFX11-NEXT: s_lshr_b32 s72, s9, 16 +; GFX11-NEXT: s_lshr_b32 s73, s8, 16 +; GFX11-NEXT: s_lshr_b32 s74, s7, 16 +; GFX11-NEXT: s_lshr_b32 s75, s6, 16 +; GFX11-NEXT: s_lshr_b32 s76, s5, 16 +; GFX11-NEXT: s_lshr_b32 s77, s4, 16 ; GFX11-NEXT: s_lshr_b32 s78, s3, 16 ; GFX11-NEXT: s_lshr_b32 s79, s2, 16 ; GFX11-NEXT: s_lshr_b32 s88, s1, 16 @@ -31954,44 +32526,44 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i ; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s88 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s79 ; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s78 -; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s77 -; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s76 -; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s75 -; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s74 -; GFX11-NEXT: s_pack_ll_b32_b16 s20, s20, s73 -; GFX11-NEXT: s_pack_ll_b32_b16 s21, s21, s72 -; GFX11-NEXT: s_pack_ll_b32_b16 s22, s22, s63 -; GFX11-NEXT: s_pack_ll_b32_b16 s23, s23, s62 -; GFX11-NEXT: s_pack_ll_b32_b16 s24, s24, s61 -; GFX11-NEXT: s_pack_ll_b32_b16 s25, s25, s60 -; GFX11-NEXT: s_pack_ll_b32_b16 s26, s26, s59 -; GFX11-NEXT: s_pack_ll_b32_b16 s27, s27, s58 -; GFX11-NEXT: s_pack_ll_b32_b16 s28, s28, s57 -; GFX11-NEXT: s_pack_ll_b32_b16 s29, s29, s56 -; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s47 -; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s46 -; GFX11-NEXT: s_pack_ll_b32_b16 s6, s6, s45 -; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s44 -; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s43 -; GFX11-NEXT: s_pack_ll_b32_b16 s9, s9, s42 -; GFX11-NEXT: s_pack_ll_b32_b16 s10, s10, s41 -; GFX11-NEXT: s_pack_ll_b32_b16 s11, s11, s40 -; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s15 -; GFX11-NEXT: s_pack_ll_b32_b16 s12, s12, s14 +; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s77 +; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s76 +; GFX11-NEXT: s_pack_ll_b32_b16 s6, s6, s75 +; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s74 +; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s73 +; GFX11-NEXT: s_pack_ll_b32_b16 s9, s9, s72 +; GFX11-NEXT: s_pack_ll_b32_b16 s10, s10, s63 +; GFX11-NEXT: s_pack_ll_b32_b16 s11, s11, s62 +; GFX11-NEXT: s_pack_ll_b32_b16 s12, s12, s61 +; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s60 +; GFX11-NEXT: s_pack_ll_b32_b16 s14, s14, s59 +; GFX11-NEXT: s_pack_ll_b32_b16 s15, s15, s58 +; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s57 +; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s56 +; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s47 +; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s46 +; GFX11-NEXT: s_pack_ll_b32_b16 s20, s20, s45 +; GFX11-NEXT: s_pack_ll_b32_b16 s21, s21, s44 +; GFX11-NEXT: s_pack_ll_b32_b16 s22, s22, s43 +; GFX11-NEXT: s_pack_ll_b32_b16 s23, s23, s42 +; GFX11-NEXT: s_pack_ll_b32_b16 s24, s24, s41 +; GFX11-NEXT: s_pack_ll_b32_b16 s25, s25, s40 +; GFX11-NEXT: s_pack_ll_b32_b16 s27, s27, s29 +; GFX11-NEXT: s_pack_ll_b32_b16 s26, s26, s28 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 -; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 -; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 -; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 -; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 -; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 -; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 -; GFX11-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 -; GFX11-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 -; GFX11-NEXT: v_dual_mov_b32 v22, s8 :: v_dual_mov_b32 v23, s9 -; GFX11-NEXT: v_dual_mov_b32 v24, s10 :: v_dual_mov_b32 v25, s11 -; GFX11-NEXT: v_dual_mov_b32 v26, s13 :: v_dual_mov_b32 v27, s12 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GFX11-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v21, s21 +; GFX11-NEXT: v_dual_mov_b32 v22, s22 :: v_dual_mov_b32 v23, s23 +; GFX11-NEXT: v_dual_mov_b32 v24, s24 :: v_dual_mov_b32 v25, s25 +; GFX11-NEXT: v_dual_mov_b32 v26, s27 :: v_dual_mov_b32 v27, s26 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB45_4: ; GFX11-NEXT: ; implicit-def: $sgpr89 @@ -32020,8 +32592,8 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr41 ; GFX11-NEXT: ; implicit-def: $sgpr40 -; GFX11-NEXT: ; implicit-def: $sgpr15 -; GFX11-NEXT: ; implicit-def: $sgpr14 +; GFX11-NEXT: ; implicit-def: $sgpr29 +; GFX11-NEXT: ; implicit-def: $sgpr28 ; GFX11-NEXT: s_branch .LBB45_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -36088,8 +36660,8 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; VI-NEXT: v_mov_b32_e32 v19, s16 -; VI-NEXT: v_mov_b32_e32 v20, s17 +; VI-NEXT: v_mov_b32_e32 v17, s16 +; VI-NEXT: v_mov_b32_e32 v18, s17 ; VI-NEXT: v_mov_b32_e32 v15, s18 ; VI-NEXT: v_mov_b32_e32 v16, s19 ; VI-NEXT: v_mov_b32_e32 v32, s20 @@ -36101,8 +36673,8 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v21, s26 ; VI-NEXT: v_mov_b32_e32 v22, s27 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v17, s28 -; VI-NEXT: v_mov_b32_e32 v18, s29 +; VI-NEXT: v_mov_b32_e32 v19, s28 +; VI-NEXT: v_mov_b32_e32 v20, s29 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill @@ -36127,8 +36699,8 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v24 @@ -36139,8 +36711,8 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v32 ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 ; VI-NEXT: s_cbranch_execnz .LBB49_3 ; VI-NEXT: .LBB49_2: ; %cmp.true ; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -36150,13 +36722,13 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; VI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; VI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 ; VI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 ; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v12 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 @@ -36171,8 +36743,8 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v24 @@ -36183,15 +36755,15 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v32 ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 ; VI-NEXT: .LBB49_3: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_or_b32_sdwa v28, v19, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 -; VI-NEXT: v_or_b32_sdwa v29, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v30 -; VI-NEXT: v_or_b32_sdwa v30, v15, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v17, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v29 +; VI-NEXT: v_or_b32_sdwa v29, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v30 +; VI-NEXT: v_or_b32_sdwa v30, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v31 ; VI-NEXT: v_or_b32_sdwa v31, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v47 @@ -36212,11 +36784,11 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; VI-NEXT: v_or_b32_sdwa v39, v22, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v48 ; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_or_b32_sdwa v48, v17, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v48, v19, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v49 ; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v45 -; VI-NEXT: v_or_b32_sdwa v49, v18, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v49, v20, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v44 ; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -36301,8 +36873,8 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GFX9-NEXT: v_mov_b32_e32 v19, s16 -; GFX9-NEXT: v_mov_b32_e32 v20, s17 +; GFX9-NEXT: v_mov_b32_e32 v17, s16 +; GFX9-NEXT: v_mov_b32_e32 v18, s17 ; GFX9-NEXT: v_mov_b32_e32 v15, s18 ; GFX9-NEXT: v_mov_b32_e32 v16, s19 ; GFX9-NEXT: v_mov_b32_e32 v32, s20 @@ -36314,8 +36886,8 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v21, s26 ; GFX9-NEXT: v_mov_b32_e32 v22, s27 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_mov_b32_e32 v17, s28 -; GFX9-NEXT: v_mov_b32_e32 v18, s29 +; GFX9-NEXT: v_mov_b32_e32 v19, s28 +; GFX9-NEXT: v_mov_b32_e32 v20, s29 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill @@ -36340,8 +36912,8 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v22 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v24 @@ -36352,8 +36924,8 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v32 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v17 ; GFX9-NEXT: s_cbranch_execnz .LBB49_3 ; GFX9-NEXT: .LBB49_2: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -36363,13 +36935,13 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; GFX9-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; GFX9-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 ; GFX9-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 ; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v11 @@ -36384,8 +36956,8 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v22 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v24 @@ -36396,8 +36968,8 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v32 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v17 ; GFX9-NEXT: .LBB49_3: ; %end ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v15 @@ -36419,25 +36991,25 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; GFX9-NEXT: v_lshl_or_b32 v38, v38, 16, v15 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v22 ; GFX9-NEXT: v_lshl_or_b32 v39, v39, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v17 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v19 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v48, v48, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v18 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v20 ; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; GFX9-NEXT: v_lshl_or_b32 v49, v49, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v15, v45, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v18 ; GFX9-NEXT: v_lshl_or_b32 v16, v44, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v17 ; GFX9-NEXT: v_lshl_or_b32 v17, v43, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v19 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v20 ; GFX9-NEXT: v_lshl_or_b32 v18, v42, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v19 ; GFX9-NEXT: v_lshl_or_b32 v19, v41, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 ; GFX9-NEXT: v_lshl_or_b32 v20, v40, 16, v0 @@ -36693,9 +37265,9 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v21, s18 :: v_dual_mov_b32 v22, s19 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, s20 :: v_dual_mov_b32 v20, s21 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s22 :: v_dual_mov_b32 v12, s23 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s24 :: v_dual_mov_b32 v18, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s24 :: v_dual_mov_b32 v16, s25 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s26 :: v_dual_mov_b32 v14, s27 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s28 :: v_dual_mov_b32 v16, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v18, s29 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB49_4 @@ -36710,12 +37282,12 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v17 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v14 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v15 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v20 @@ -36736,9 +37308,9 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 ; GFX11-FAKE16-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; GFX11-FAKE16-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 ; GFX11-FAKE16-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; GFX11-FAKE16-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 @@ -36755,12 +37327,12 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v17 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v14 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v15 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v20 @@ -36774,56 +37346,54 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v27 ; GFX11-FAKE16-NEXT: .LBB49_3: ; %end -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v37, 16, v22 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v28 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v71, 16, v19 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v20 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v36, 16, v21 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v10, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v14 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v51, 16, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, v29 :: v_dual_and_b32 v4, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v70, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v69, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v16 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v25 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v10, 16, v11 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v15 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v16 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v66, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v13 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v68, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v66, 16, v19 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v54, 16, v1 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v51, 16, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v9 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v36 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v36 :: v_dual_and_b32 v1, 0xffff, v6 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v34, 16, v23 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v26 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, v29 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v70, 16, v12 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v69, 16, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v27 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v68, 16, v17 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v64, 16, v21 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v52, 16, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v8 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v38, 16, v4 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v34 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v25 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v67, 16, v18 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v65, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v67, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v65, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v64, 16, v18 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v55, 16, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v53, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v5 @@ -37352,7 +37922,7 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 ; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 ; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 -; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v26 ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v27 ; SI-NEXT: .LBB50_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] @@ -40598,21 +41168,21 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; SI-NEXT: v_readfirstlane_b32 s40, v1 -; SI-NEXT: v_readfirstlane_b32 s41, v2 -; SI-NEXT: v_readfirstlane_b32 s14, v3 -; SI-NEXT: v_readfirstlane_b32 s15, v4 -; SI-NEXT: v_readfirstlane_b32 s12, v5 -; SI-NEXT: v_readfirstlane_b32 s13, v6 -; SI-NEXT: v_readfirstlane_b32 s10, v7 -; SI-NEXT: v_readfirstlane_b32 s11, v8 -; SI-NEXT: v_readfirstlane_b32 s8, v9 -; SI-NEXT: v_readfirstlane_b32 s9, v10 -; SI-NEXT: v_readfirstlane_b32 s6, v11 -; SI-NEXT: v_readfirstlane_b32 s7, v12 -; SI-NEXT: v_readfirstlane_b32 s4, v13 -; SI-NEXT: s_and_b64 s[42:43], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s5, v14 +; SI-NEXT: v_mov_b32_e32 v25, s16 +; SI-NEXT: v_mov_b32_e32 v26, s17 +; SI-NEXT: v_mov_b32_e32 v21, s18 +; SI-NEXT: v_mov_b32_e32 v22, s19 +; SI-NEXT: v_mov_b32_e32 v27, s20 +; SI-NEXT: v_mov_b32_e32 v28, s21 +; SI-NEXT: v_mov_b32_e32 v23, s22 +; SI-NEXT: v_mov_b32_e32 v24, s23 +; SI-NEXT: v_mov_b32_e32 v19, s24 +; SI-NEXT: v_mov_b32_e32 v20, s25 +; SI-NEXT: v_mov_b32_e32 v17, s26 +; SI-NEXT: v_mov_b32_e32 v18, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v15, s28 +; SI-NEXT: v_mov_b32_e32 v16, s29 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -40631,401 +41201,502 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s42, s5, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s42 -; SI-NEXT: s_lshr_b32 s42, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s42 -; SI-NEXT: s_lshr_b32 s42, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s42 -; SI-NEXT: s_lshr_b32 s42, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s42 -; SI-NEXT: s_lshr_b32 s42, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s42 -; SI-NEXT: s_lshr_b32 s42, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s42 -; SI-NEXT: s_lshr_b32 s42, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s42 -; SI-NEXT: s_lshr_b32 s42, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s42 -; SI-NEXT: s_lshr_b32 s42, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s42 -; SI-NEXT: s_lshr_b32 s42, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s42 -; SI-NEXT: s_lshr_b32 s42, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s42 -; SI-NEXT: s_lshr_b32 s42, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s42 -; SI-NEXT: s_lshr_b32 s42, s41, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s42 -; SI-NEXT: s_lshr_b32 s42, s40, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s42 -; SI-NEXT: s_lshr_b32 s42, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s42 -; SI-NEXT: s_lshr_b32 s42, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s42 -; SI-NEXT: s_lshr_b32 s42, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s42 -; SI-NEXT: s_lshr_b32 s42, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s42 -; SI-NEXT: s_lshr_b32 s42, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s42 -; SI-NEXT: s_lshr_b32 s42, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s42 -; SI-NEXT: s_lshr_b32 s42, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s42 -; SI-NEXT: s_lshr_b32 s42, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s42 -; SI-NEXT: s_lshr_b32 s42, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s42 -; SI-NEXT: s_lshr_b32 s42, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s42 -; SI-NEXT: s_lshr_b32 s42, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s42 -; SI-NEXT: s_lshr_b32 s42, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s42 -; SI-NEXT: s_lshr_b32 s42, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s42 -; SI-NEXT: s_lshr_b32 s42, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v57, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s9 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v58, s8 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v59, s11 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v60, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 -; SI-NEXT: s_cbranch_execnz .LBB53_3 -; SI-NEXT: .LBB53_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[1:2], s[16:17], 1.0 -; SI-NEXT: v_add_f64 v[42:43], s[18:19], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v39, v10 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v13 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v39, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v29 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v42 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v39, v8 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v43 -; SI-NEXT: v_add_f64 v[22:23], s[14:15], 1.0 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v39, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v29 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v39, v6 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v39, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v13 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v39, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v11 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v39, v2 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v39, v1 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f64 v[3:4], s[4:5], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_add_f64 v[18:19], s[12:13], 1.0 -; SI-NEXT: v_add_f64 v[7:8], s[6:7], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v42 -; SI-NEXT: v_add_f64 v[49:50], s[22:23], 1.0 -; SI-NEXT: v_add_f64 v[37:38], s[24:25], 1.0 -; SI-NEXT: v_add_f64 v[15:16], s[10:11], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v44 -; SI-NEXT: v_add_f64 v[53:54], s[20:21], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v46 -; SI-NEXT: v_add_f64 v[35:36], s[26:27], 1.0 -; SI-NEXT: v_add_f64 v[30:31], s[28:29], 1.0 -; SI-NEXT: v_add_f64 v[26:27], s[40:41], 1.0 -; SI-NEXT: v_add_f64 v[11:12], s[8:9], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v12 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v39, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v39, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v27 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v39, v18 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v39, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v24 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v39, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v39, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v39, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v25 +; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v6 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v4 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 +; SI-NEXT: v_add_f64 v[53:54], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_mov_b32_e32 v45, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v61 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v49, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v56 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v46, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v56, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v21 +; SI-NEXT: v_mov_b32_e32 v29, v14 +; SI-NEXT: v_mov_b32_e32 v57, v11 +; SI-NEXT: v_mov_b32_e32 v47, v12 +; SI-NEXT: v_mov_b32_e32 v43, v13 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: .LBB53_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v5, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v2 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v9 -; SI-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v6, v14, v6 -; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v5, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v40 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v43 -; SI-NEXT: v_add_i32_e32 v9, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v18 -; SI-NEXT: v_add_i32_e32 v9, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 -; SI-NEXT: v_add_i32_e32 v9, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v54 -; SI-NEXT: v_add_i32_e32 v9, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v16 -; SI-NEXT: v_add_i32_e32 v9, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v50 -; SI-NEXT: v_add_i32_e32 v9, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v15 -; SI-NEXT: v_add_i32_e32 v9, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v38 -; SI-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v36 -; SI-NEXT: v_add_i32_e32 v9, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v3 -; SI-NEXT: v_add_i32_e32 v9, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 -; SI-NEXT: v_add_i32_e32 v9, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v31 -; SI-NEXT: v_add_i32_e32 v9, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v11 -; SI-NEXT: v_add_i32_e32 v9, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v27 -; SI-NEXT: v_add_i32_e32 v9, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_i32_e32 v6, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v23 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v19 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v60 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v58 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x54, v0 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v40 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 -; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -41049,70 +41720,91 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: s_branch .LBB53_2 ; ; VI-LABEL: bitcast_v14f64_to_v56f16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; VI-NEXT: v_mov_b32_e32 v19, s16 -; VI-NEXT: v_mov_b32_e32 v20, s17 +; VI-NEXT: v_mov_b32_e32 v17, s16 +; VI-NEXT: v_mov_b32_e32 v18, s17 ; VI-NEXT: v_mov_b32_e32 v15, s18 ; VI-NEXT: v_mov_b32_e32 v16, s19 ; VI-NEXT: v_mov_b32_e32 v32, s20 @@ -41124,8 +41816,8 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; VI-NEXT: v_mov_b32_e32 v21, s26 ; VI-NEXT: v_mov_b32_e32 v22, s27 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v17, s28 -; VI-NEXT: v_mov_b32_e32 v18, s29 +; VI-NEXT: v_mov_b32_e32 v19, s28 +; VI-NEXT: v_mov_b32_e32 v20, s29 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill @@ -41150,8 +41842,8 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v24 @@ -41162,8 +41854,8 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v32 ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 ; VI-NEXT: s_cbranch_execnz .LBB53_3 ; VI-NEXT: .LBB53_2: ; %cmp.true ; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -41173,13 +41865,13 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; VI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; VI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 ; VI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 ; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v12 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 @@ -41194,8 +41886,8 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v24 @@ -41206,15 +41898,15 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v32 ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 ; VI-NEXT: .LBB53_3: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_or_b32_sdwa v28, v19, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 -; VI-NEXT: v_or_b32_sdwa v29, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v30 -; VI-NEXT: v_or_b32_sdwa v30, v15, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v17, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v29 +; VI-NEXT: v_or_b32_sdwa v29, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v30 +; VI-NEXT: v_or_b32_sdwa v30, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v31 ; VI-NEXT: v_or_b32_sdwa v31, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v47 @@ -41235,11 +41927,11 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; VI-NEXT: v_or_b32_sdwa v39, v22, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v48 ; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_or_b32_sdwa v48, v17, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v48, v19, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v49 ; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v45 -; VI-NEXT: v_or_b32_sdwa v49, v18, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v49, v20, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v44 ; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -41324,8 +42016,8 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GFX9-NEXT: v_mov_b32_e32 v19, s16 -; GFX9-NEXT: v_mov_b32_e32 v20, s17 +; GFX9-NEXT: v_mov_b32_e32 v17, s16 +; GFX9-NEXT: v_mov_b32_e32 v18, s17 ; GFX9-NEXT: v_mov_b32_e32 v15, s18 ; GFX9-NEXT: v_mov_b32_e32 v16, s19 ; GFX9-NEXT: v_mov_b32_e32 v32, s20 @@ -41337,8 +42029,8 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; GFX9-NEXT: v_mov_b32_e32 v21, s26 ; GFX9-NEXT: v_mov_b32_e32 v22, s27 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_mov_b32_e32 v17, s28 -; GFX9-NEXT: v_mov_b32_e32 v18, s29 +; GFX9-NEXT: v_mov_b32_e32 v19, s28 +; GFX9-NEXT: v_mov_b32_e32 v20, s29 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill @@ -41363,8 +42055,8 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v22 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v24 @@ -41375,8 +42067,8 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v32 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v17 ; GFX9-NEXT: s_cbranch_execnz .LBB53_3 ; GFX9-NEXT: .LBB53_2: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -41386,13 +42078,13 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; GFX9-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; GFX9-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 ; GFX9-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 ; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v11 @@ -41407,8 +42099,8 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v22 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v24 @@ -41419,8 +42111,8 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v32 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v17 ; GFX9-NEXT: .LBB53_3: ; %end ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v15 @@ -41442,25 +42134,25 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; GFX9-NEXT: v_lshl_or_b32 v38, v38, 16, v15 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v22 ; GFX9-NEXT: v_lshl_or_b32 v39, v39, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v17 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v19 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v48, v48, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v18 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v20 ; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; GFX9-NEXT: v_lshl_or_b32 v49, v49, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v15, v45, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v18 ; GFX9-NEXT: v_lshl_or_b32 v16, v44, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v17 ; GFX9-NEXT: v_lshl_or_b32 v17, v43, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v19 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v20 ; GFX9-NEXT: v_lshl_or_b32 v18, v42, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v19 ; GFX9-NEXT: v_lshl_or_b32 v19, v41, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 ; GFX9-NEXT: v_lshl_or_b32 v20, v40, 16, v0 @@ -41716,9 +42408,9 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v21, s18 :: v_dual_mov_b32 v22, s19 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, s20 :: v_dual_mov_b32 v20, s21 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s22 :: v_dual_mov_b32 v12, s23 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s24 :: v_dual_mov_b32 v18, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s24 :: v_dual_mov_b32 v16, s25 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s26 :: v_dual_mov_b32 v14, s27 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s28 :: v_dual_mov_b32 v16, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v18, s29 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB53_4 @@ -41733,12 +42425,12 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v17 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v14 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v15 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v20 @@ -41759,9 +42451,9 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 ; GFX11-FAKE16-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; GFX11-FAKE16-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 ; GFX11-FAKE16-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; GFX11-FAKE16-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 @@ -41778,12 +42470,12 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v17 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v14 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v15 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v20 @@ -41797,56 +42489,54 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v27 ; GFX11-FAKE16-NEXT: .LBB53_3: ; %end -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v37, 16, v22 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v28 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v71, 16, v19 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v20 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v36, 16, v21 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v10, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v14 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v51, 16, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, v29 :: v_dual_and_b32 v4, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v70, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v69, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v16 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v25 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v10, 16, v11 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v15 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v16 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v66, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v13 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v68, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v66, 16, v19 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v54, 16, v1 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v51, 16, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v9 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v36 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v36 :: v_dual_and_b32 v1, 0xffff, v6 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v34, 16, v23 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v26 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, v29 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v70, 16, v12 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v69, 16, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v27 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v68, 16, v17 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v64, 16, v21 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v52, 16, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v8 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v38, 16, v4 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v34 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v25 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v67, 16, v18 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v65, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v67, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v65, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v64, 16, v18 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v55, 16, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v53, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v5 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll index c9e5771240078..37f049de7a633 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll @@ -3848,30 +3848,58 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v18, s30, 0 -; SI-NEXT: v_writelane_b32 v18, s31, 1 -; SI-NEXT: v_writelane_b32 v18, s34, 2 -; SI-NEXT: v_writelane_b32 v18, s35, 3 -; SI-NEXT: v_writelane_b32 v18, s36, 4 -; SI-NEXT: v_writelane_b32 v18, s37, 5 -; SI-NEXT: v_writelane_b32 v18, s38, 6 -; SI-NEXT: v_writelane_b32 v18, s39, 7 -; SI-NEXT: v_writelane_b32 v18, s48, 8 -; SI-NEXT: v_writelane_b32 v18, s49, 9 -; SI-NEXT: v_writelane_b32 v18, s50, 10 -; SI-NEXT: v_writelane_b32 v18, s51, 11 -; SI-NEXT: v_writelane_b32 v18, s52, 12 -; SI-NEXT: v_writelane_b32 v18, s53, 13 -; SI-NEXT: v_writelane_b32 v18, s54, 14 +; SI-NEXT: v_writelane_b32 v20, s30, 0 +; SI-NEXT: v_writelane_b32 v20, s31, 1 +; SI-NEXT: v_writelane_b32 v20, s34, 2 +; SI-NEXT: v_writelane_b32 v20, s35, 3 +; SI-NEXT: v_writelane_b32 v20, s36, 4 +; SI-NEXT: v_writelane_b32 v20, s37, 5 +; SI-NEXT: v_writelane_b32 v20, s38, 6 +; SI-NEXT: v_writelane_b32 v20, s39, 7 +; SI-NEXT: v_writelane_b32 v20, s48, 8 +; SI-NEXT: v_mov_b32_e32 v18, s16 +; SI-NEXT: v_mov_b32_e32 v19, s17 +; SI-NEXT: v_writelane_b32 v20, s49, 9 +; SI-NEXT: v_readfirstlane_b32 s46, v18 +; SI-NEXT: v_mov_b32_e32 v18, s18 +; SI-NEXT: v_readfirstlane_b32 s47, v19 +; SI-NEXT: v_mov_b32_e32 v19, s19 +; SI-NEXT: v_writelane_b32 v20, s50, 10 +; SI-NEXT: v_readfirstlane_b32 s44, v18 +; SI-NEXT: v_mov_b32_e32 v18, s20 +; SI-NEXT: v_readfirstlane_b32 s45, v19 +; SI-NEXT: v_mov_b32_e32 v19, s21 +; SI-NEXT: v_writelane_b32 v20, s51, 11 +; SI-NEXT: v_readfirstlane_b32 s42, v18 +; SI-NEXT: v_mov_b32_e32 v18, s22 +; SI-NEXT: v_readfirstlane_b32 s43, v19 +; SI-NEXT: v_mov_b32_e32 v19, s23 +; SI-NEXT: v_writelane_b32 v20, s52, 12 +; SI-NEXT: v_readfirstlane_b32 s40, v18 +; SI-NEXT: v_mov_b32_e32 v18, s24 +; SI-NEXT: v_readfirstlane_b32 s41, v19 +; SI-NEXT: v_mov_b32_e32 v19, s25 +; SI-NEXT: v_writelane_b32 v20, s53, 13 +; SI-NEXT: v_readfirstlane_b32 s24, v18 +; SI-NEXT: v_mov_b32_e32 v18, s26 +; SI-NEXT: v_readfirstlane_b32 s25, v19 +; SI-NEXT: v_mov_b32_e32 v19, s27 +; SI-NEXT: v_writelane_b32 v20, s54, 14 +; SI-NEXT: v_readfirstlane_b32 s22, v18 +; SI-NEXT: v_mov_b32_e32 v18, s28 +; SI-NEXT: v_readfirstlane_b32 s23, v19 +; SI-NEXT: v_mov_b32_e32 v19, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; SI-NEXT: v_writelane_b32 v18, s55, 15 -; SI-NEXT: v_readfirstlane_b32 s42, v1 -; SI-NEXT: v_readfirstlane_b32 s43, v2 -; SI-NEXT: v_readfirstlane_b32 s40, v3 -; SI-NEXT: v_readfirstlane_b32 s41, v4 +; SI-NEXT: v_writelane_b32 v20, s55, 15 +; SI-NEXT: v_readfirstlane_b32 s20, v18 +; SI-NEXT: v_readfirstlane_b32 s21, v19 +; SI-NEXT: v_readfirstlane_b32 s18, v1 +; SI-NEXT: v_readfirstlane_b32 s19, v2 +; SI-NEXT: v_readfirstlane_b32 s16, v3 +; SI-NEXT: v_readfirstlane_b32 s17, v4 ; SI-NEXT: v_readfirstlane_b32 s14, v5 ; SI-NEXT: v_readfirstlane_b32 s15, v6 ; SI-NEXT: v_readfirstlane_b32 s12, v7 @@ -3883,9 +3911,9 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; SI-NEXT: v_readfirstlane_b32 s6, v13 ; SI-NEXT: v_readfirstlane_b32 s7, v14 ; SI-NEXT: v_readfirstlane_b32 s4, v15 -; SI-NEXT: s_and_b64 s[44:45], vcc, exec +; SI-NEXT: s_and_b64 s[26:27], vcc, exec ; SI-NEXT: v_readfirstlane_b32 s5, v16 -; SI-NEXT: v_writelane_b32 v18, s64, 16 +; SI-NEXT: v_writelane_b32 v20, s64, 16 ; SI-NEXT: s_cbranch_scc0 .LBB13_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s34, s5, 16 @@ -3894,50 +3922,50 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; SI-NEXT: s_lshr_b32 s37, s11, 16 ; SI-NEXT: s_lshr_b32 s38, s13, 16 ; SI-NEXT: s_lshr_b32 s39, s15, 16 -; SI-NEXT: s_lshr_b32 s48, s41, 16 -; SI-NEXT: s_lshr_b32 s49, s43, 16 -; SI-NEXT: s_lshr_b32 s50, s29, 16 -; SI-NEXT: s_lshr_b32 s51, s27, 16 +; SI-NEXT: s_lshr_b32 s48, s17, 16 +; SI-NEXT: s_lshr_b32 s49, s19, 16 +; SI-NEXT: s_lshr_b32 s50, s21, 16 +; SI-NEXT: s_lshr_b32 s51, s23, 16 ; SI-NEXT: s_lshr_b32 s52, s25, 16 -; SI-NEXT: s_lshr_b32 s53, s23, 16 -; SI-NEXT: s_lshr_b32 s54, s21, 16 -; SI-NEXT: s_lshr_b32 s55, s19, 16 -; SI-NEXT: s_lshr_b32 s64, s17, 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 16 +; SI-NEXT: s_lshr_b32 s53, s41, 16 +; SI-NEXT: s_lshr_b32 s54, s43, 16 +; SI-NEXT: s_lshr_b32 s55, s45, 16 +; SI-NEXT: s_lshr_b32 s64, s47, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 ; SI-NEXT: s_lshr_b64 s[56:57], s[8:9], 16 ; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16 ; SI-NEXT: s_lshr_b64 s[60:61], s[12:13], 16 ; SI-NEXT: s_lshr_b64 s[62:63], s[14:15], 16 -; SI-NEXT: s_lshr_b64 s[72:73], s[40:41], 16 -; SI-NEXT: s_lshr_b64 s[74:75], s[42:43], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[78:79], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[22:23], 16 ; SI-NEXT: s_lshr_b64 s[88:89], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[90:91], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[92:93], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[94:95], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[30:31], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[46:47], 16 ; SI-NEXT: s_cbranch_execnz .LBB13_3 ; SI-NEXT: .LBB13_2: ; %cmp.true -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s47, s47, 3 +; SI-NEXT: s_add_i32 s46, s46, 3 +; SI-NEXT: s_add_i32 s45, s45, 3 +; SI-NEXT: s_add_i32 s44, s44, 3 ; SI-NEXT: s_add_i32 s43, s43, 3 ; SI-NEXT: s_add_i32 s42, s42, 3 ; SI-NEXT: s_add_i32 s41, s41, 3 ; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s15, s15, 3 ; SI-NEXT: s_add_i32 s14, s14, 3 ; SI-NEXT: s_add_i32 s13, s13, 3 @@ -3950,149 +3978,149 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s5, s5, 3 ; SI-NEXT: s_add_i32 s4, s4, 3 -; SI-NEXT: s_lshr_b64 s[44:45], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 ; SI-NEXT: s_lshr_b64 s[56:57], s[8:9], 16 ; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16 ; SI-NEXT: s_lshr_b64 s[60:61], s[12:13], 16 ; SI-NEXT: s_lshr_b64 s[62:63], s[14:15], 16 -; SI-NEXT: s_lshr_b64 s[72:73], s[40:41], 16 -; SI-NEXT: s_lshr_b64 s[74:75], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[18:19], 16 ; SI-NEXT: s_lshr_b32 s34, s5, 16 ; SI-NEXT: s_lshr_b32 s35, s7, 16 ; SI-NEXT: s_lshr_b32 s36, s9, 16 ; SI-NEXT: s_lshr_b32 s37, s11, 16 ; SI-NEXT: s_lshr_b32 s38, s13, 16 ; SI-NEXT: s_lshr_b32 s39, s15, 16 -; SI-NEXT: s_lshr_b32 s48, s41, 16 -; SI-NEXT: s_lshr_b32 s49, s43, 16 -; SI-NEXT: s_lshr_b32 s50, s29, 16 -; SI-NEXT: s_lshr_b32 s51, s27, 16 +; SI-NEXT: s_lshr_b32 s48, s17, 16 +; SI-NEXT: s_lshr_b32 s49, s19, 16 +; SI-NEXT: s_lshr_b32 s50, s21, 16 +; SI-NEXT: s_lshr_b32 s51, s23, 16 ; SI-NEXT: s_lshr_b32 s52, s25, 16 -; SI-NEXT: s_lshr_b32 s53, s23, 16 -; SI-NEXT: s_lshr_b32 s54, s21, 16 -; SI-NEXT: s_lshr_b32 s55, s19, 16 -; SI-NEXT: s_lshr_b32 s64, s17, 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[78:79], s[26:27], 16 +; SI-NEXT: s_lshr_b32 s53, s41, 16 +; SI-NEXT: s_lshr_b32 s54, s43, 16 +; SI-NEXT: s_lshr_b32 s55, s45, 16 +; SI-NEXT: s_lshr_b32 s64, s47, 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[22:23], 16 ; SI-NEXT: s_lshr_b64 s[88:89], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[90:91], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[92:93], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[94:95], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[30:31], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[46:47], 16 ; SI-NEXT: .LBB13_3: ; %end -; SI-NEXT: s_lshl_b32 s45, s30, 16 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s16, s16, s45 -; SI-NEXT: v_mov_b32_e32 v1, s16 -; SI-NEXT: s_and_b32 s16, s17, 0xffff -; SI-NEXT: s_lshl_b32 s17, s64, 16 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_lshl_b32 s16, s94, 16 -; SI-NEXT: s_and_b32 s17, s18, 0xffff -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_mov_b32_e32 v3, s16 -; SI-NEXT: s_and_b32 s16, s19, 0xffff -; SI-NEXT: s_lshl_b32 s17, s55, 16 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_mov_b32_e32 v4, s16 -; SI-NEXT: s_lshl_b32 s16, s92, 16 -; SI-NEXT: s_and_b32 s17, s20, 0xffff -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_mov_b32_e32 v5, s16 -; SI-NEXT: s_and_b32 s16, s21, 0xffff -; SI-NEXT: s_lshl_b32 s17, s54, 16 +; SI-NEXT: s_lshl_b32 s27, s30, 16 +; SI-NEXT: s_and_b32 s29, s46, 0xffff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: v_mov_b32_e32 v1, s27 +; SI-NEXT: s_and_b32 s27, s47, 0xffff +; SI-NEXT: s_lshl_b32 s29, s64, 16 +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_lshl_b32 s27, s94, 16 +; SI-NEXT: s_and_b32 s29, s44, 0xffff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: v_mov_b32_e32 v3, s27 +; SI-NEXT: s_and_b32 s27, s45, 0xffff +; SI-NEXT: s_lshl_b32 s29, s55, 16 +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_mov_b32_e32 v4, s27 +; SI-NEXT: s_lshl_b32 s27, s92, 16 +; SI-NEXT: s_and_b32 s29, s42, 0xffff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: v_mov_b32_e32 v5, s27 +; SI-NEXT: s_and_b32 s27, s43, 0xffff +; SI-NEXT: s_lshl_b32 s29, s54, 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: v_mov_b32_e32 v6, s16 +; SI-NEXT: v_mov_b32_e32 v6, s27 ; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_lshl_b32 s16, s90, 16 -; SI-NEXT: s_and_b32 s17, s22, 0xffff +; SI-NEXT: s_lshl_b32 s27, s90, 16 +; SI-NEXT: s_and_b32 s29, s40, 0xffff ; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_or_b32 s27, s29, s27 ; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s23, 0xffff -; SI-NEXT: s_lshl_b32 s17, s53, 16 +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_and_b32 s27, s41, 0xffff +; SI-NEXT: s_lshl_b32 s29, s53, 16 ; SI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s24, 0xffff -; SI-NEXT: s_lshl_b32 s17, s88, 16 +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_lshl_b32 s27, s88, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s24, s24, s27 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s25, 0xffff -; SI-NEXT: s_lshl_b32 s17, s52, 16 +; SI-NEXT: v_mov_b32_e32 v2, s24 +; SI-NEXT: s_and_b32 s24, s25, 0xffff +; SI-NEXT: s_lshl_b32 s25, s52, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s24, s24, s25 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s26, 0xffff -; SI-NEXT: s_lshl_b32 s17, s78, 16 +; SI-NEXT: v_mov_b32_e32 v2, s24 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_lshl_b32 s24, s78, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s22, s22, s24 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s27, 0xffff -; SI-NEXT: s_lshl_b32 s17, s51, 16 +; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: s_and_b32 s22, s23, 0xffff +; SI-NEXT: s_lshl_b32 s23, s51, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s22, s22, s23 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s28, 0xffff -; SI-NEXT: s_lshl_b32 s17, s76, 16 +; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_lshl_b32 s22, s76, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s20, s20, s22 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s29, 0xffff -; SI-NEXT: s_lshl_b32 s17, s50, 16 +; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_and_b32 s20, s21, 0xffff +; SI-NEXT: s_lshl_b32 s21, s50, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s20, s20, s21 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s42, 0xffff -; SI-NEXT: s_lshl_b32 s17, s74, 16 +; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s20, s74, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s18, s18, s20 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s43, 0xffff -; SI-NEXT: s_lshl_b32 s17, s49, 16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_and_b32 s18, s19, 0xffff +; SI-NEXT: s_lshl_b32 s19, s49, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s40, 0xffff -; SI-NEXT: s_lshl_b32 s17, s72, 16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s18, s72, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s16, s16, s18 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s41, 0xffff +; SI-NEXT: s_and_b32 s16, s17, 0xffff ; SI-NEXT: s_lshl_b32 s17, s48, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 ; SI-NEXT: s_or_b32 s16, s16, s17 @@ -4156,7 +4184,7 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_lshl_b32 s8, s46, 16 +; SI-NEXT: s_lshl_b32 s8, s28, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x64, v0 ; SI-NEXT: s_or_b32 s6, s6, s8 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -4170,7 +4198,7 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s6, s44, 16 +; SI-NEXT: s_lshl_b32 s6, s26, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x6c, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -4184,25 +4212,25 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: v_readlane_b32 s64, v18, 16 -; SI-NEXT: v_readlane_b32 s55, v18, 15 -; SI-NEXT: v_readlane_b32 s54, v18, 14 -; SI-NEXT: v_readlane_b32 s53, v18, 13 -; SI-NEXT: v_readlane_b32 s52, v18, 12 -; SI-NEXT: v_readlane_b32 s51, v18, 11 -; SI-NEXT: v_readlane_b32 s50, v18, 10 -; SI-NEXT: v_readlane_b32 s49, v18, 9 -; SI-NEXT: v_readlane_b32 s48, v18, 8 -; SI-NEXT: v_readlane_b32 s39, v18, 7 -; SI-NEXT: v_readlane_b32 s38, v18, 6 -; SI-NEXT: v_readlane_b32 s37, v18, 5 -; SI-NEXT: v_readlane_b32 s36, v18, 4 -; SI-NEXT: v_readlane_b32 s35, v18, 3 -; SI-NEXT: v_readlane_b32 s34, v18, 2 -; SI-NEXT: v_readlane_b32 s31, v18, 1 -; SI-NEXT: v_readlane_b32 s30, v18, 0 +; SI-NEXT: v_readlane_b32 s64, v20, 16 +; SI-NEXT: v_readlane_b32 s55, v20, 15 +; SI-NEXT: v_readlane_b32 s54, v20, 14 +; SI-NEXT: v_readlane_b32 s53, v20, 13 +; SI-NEXT: v_readlane_b32 s52, v20, 12 +; SI-NEXT: v_readlane_b32 s51, v20, 11 +; SI-NEXT: v_readlane_b32 s50, v20, 10 +; SI-NEXT: v_readlane_b32 s49, v20, 9 +; SI-NEXT: v_readlane_b32 s48, v20, 8 +; SI-NEXT: v_readlane_b32 s39, v20, 7 +; SI-NEXT: v_readlane_b32 s38, v20, 6 +; SI-NEXT: v_readlane_b32 s37, v20, 5 +; SI-NEXT: v_readlane_b32 s36, v20, 4 +; SI-NEXT: v_readlane_b32 s35, v20, 3 +; SI-NEXT: v_readlane_b32 s34, v20, 2 +; SI-NEXT: v_readlane_b32 s31, v20, 1 +; SI-NEXT: v_readlane_b32 s30, v20, 0 ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -4233,10 +4261,10 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr37 ; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: ; implicit-def: $sgpr36 -; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr35 ; SI-NEXT: ; implicit-def: $sgpr34 -; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: s_branch .LBB13_2 ; ; VI-LABEL: bitcast_v30i32_to_v60i16_scalar: @@ -4247,18 +4275,46 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: v_writelane_b32 v30, s30, 0 ; VI-NEXT: v_writelane_b32 v30, s31, 1 +; VI-NEXT: v_mov_b32_e32 v17, s16 +; VI-NEXT: v_mov_b32_e32 v18, s17 ; VI-NEXT: v_writelane_b32 v30, s34, 2 +; VI-NEXT: v_mov_b32_e32 v19, s18 +; VI-NEXT: v_readfirstlane_b32 s56, v17 +; VI-NEXT: v_mov_b32_e32 v17, s19 +; VI-NEXT: v_readfirstlane_b32 s47, v18 +; VI-NEXT: v_mov_b32_e32 v18, s20 ; VI-NEXT: v_writelane_b32 v30, s35, 3 +; VI-NEXT: v_readfirstlane_b32 s46, v19 +; VI-NEXT: v_mov_b32_e32 v19, s21 +; VI-NEXT: v_readfirstlane_b32 s45, v17 +; VI-NEXT: v_mov_b32_e32 v17, s22 +; VI-NEXT: v_readfirstlane_b32 s44, v18 +; VI-NEXT: v_mov_b32_e32 v18, s23 ; VI-NEXT: v_writelane_b32 v30, s36, 4 +; VI-NEXT: v_readfirstlane_b32 s43, v19 +; VI-NEXT: v_mov_b32_e32 v19, s24 +; VI-NEXT: v_readfirstlane_b32 s42, v17 +; VI-NEXT: v_mov_b32_e32 v17, s25 +; VI-NEXT: v_readfirstlane_b32 s41, v18 +; VI-NEXT: v_mov_b32_e32 v18, s26 ; VI-NEXT: v_writelane_b32 v30, s37, 5 +; VI-NEXT: v_readfirstlane_b32 s40, v19 +; VI-NEXT: v_mov_b32_e32 v19, s27 +; VI-NEXT: v_readfirstlane_b32 s26, v17 +; VI-NEXT: v_mov_b32_e32 v17, s28 +; VI-NEXT: v_readfirstlane_b32 s25, v18 +; VI-NEXT: v_mov_b32_e32 v18, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; VI-NEXT: v_writelane_b32 v30, s38, 6 -; VI-NEXT: v_readfirstlane_b32 s45, v0 -; VI-NEXT: v_readfirstlane_b32 s44, v1 -; VI-NEXT: v_readfirstlane_b32 s43, v2 -; VI-NEXT: v_readfirstlane_b32 s42, v3 -; VI-NEXT: v_readfirstlane_b32 s41, v4 -; VI-NEXT: v_readfirstlane_b32 s40, v5 +; VI-NEXT: v_readfirstlane_b32 s24, v19 +; VI-NEXT: v_readfirstlane_b32 s23, v17 +; VI-NEXT: v_readfirstlane_b32 s22, v18 +; VI-NEXT: v_readfirstlane_b32 s21, v0 +; VI-NEXT: v_readfirstlane_b32 s20, v1 +; VI-NEXT: v_readfirstlane_b32 s19, v2 +; VI-NEXT: v_readfirstlane_b32 s18, v3 +; VI-NEXT: v_readfirstlane_b32 s17, v4 +; VI-NEXT: v_readfirstlane_b32 s16, v5 ; VI-NEXT: v_readfirstlane_b32 s15, v6 ; VI-NEXT: v_readfirstlane_b32 s14, v7 ; VI-NEXT: v_readfirstlane_b32 s13, v8 @@ -4273,9 +4329,9 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; VI-NEXT: v_writelane_b32 v30, s39, 7 ; VI-NEXT: s_cbranch_scc0 .LBB13_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s46, s7, 16 -; VI-NEXT: s_lshr_b32 s47, s6, 16 -; VI-NEXT: s_lshr_b32 s56, s8, 16 +; VI-NEXT: s_lshr_b32 s27, s7, 16 +; VI-NEXT: s_lshr_b32 s28, s6, 16 +; VI-NEXT: s_lshr_b32 s29, s8, 16 ; VI-NEXT: s_lshr_b32 s57, s9, 16 ; VI-NEXT: s_lshr_b32 s58, s10, 16 ; VI-NEXT: s_lshr_b32 s59, s11, 16 @@ -4283,26 +4339,26 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; VI-NEXT: s_lshr_b32 s61, s13, 16 ; VI-NEXT: s_lshr_b32 s62, s14, 16 ; VI-NEXT: s_lshr_b32 s63, s15, 16 -; VI-NEXT: s_lshr_b32 s72, s40, 16 -; VI-NEXT: s_lshr_b32 s73, s41, 16 -; VI-NEXT: s_lshr_b32 s74, s42, 16 -; VI-NEXT: s_lshr_b32 s75, s43, 16 -; VI-NEXT: s_lshr_b32 s76, s44, 16 -; VI-NEXT: s_lshr_b32 s77, s45, 16 -; VI-NEXT: s_lshr_b32 s78, s29, 16 -; VI-NEXT: s_lshr_b32 s79, s28, 16 -; VI-NEXT: s_lshr_b32 s88, s27, 16 -; VI-NEXT: s_lshr_b32 s89, s26, 16 -; VI-NEXT: s_lshr_b32 s90, s25, 16 -; VI-NEXT: s_lshr_b32 s91, s24, 16 -; VI-NEXT: s_lshr_b32 s30, s23, 16 -; VI-NEXT: s_lshr_b32 s31, s22, 16 -; VI-NEXT: s_lshr_b32 s34, s21, 16 -; VI-NEXT: s_lshr_b32 s35, s20, 16 -; VI-NEXT: s_lshr_b32 s36, s19, 16 -; VI-NEXT: s_lshr_b32 s37, s18, 16 -; VI-NEXT: s_lshr_b32 s38, s17, 16 -; VI-NEXT: s_lshr_b32 s39, s16, 16 +; VI-NEXT: s_lshr_b32 s72, s16, 16 +; VI-NEXT: s_lshr_b32 s73, s17, 16 +; VI-NEXT: s_lshr_b32 s74, s18, 16 +; VI-NEXT: s_lshr_b32 s75, s19, 16 +; VI-NEXT: s_lshr_b32 s76, s20, 16 +; VI-NEXT: s_lshr_b32 s77, s21, 16 +; VI-NEXT: s_lshr_b32 s78, s22, 16 +; VI-NEXT: s_lshr_b32 s79, s23, 16 +; VI-NEXT: s_lshr_b32 s88, s24, 16 +; VI-NEXT: s_lshr_b32 s89, s25, 16 +; VI-NEXT: s_lshr_b32 s90, s26, 16 +; VI-NEXT: s_lshr_b32 s91, s40, 16 +; VI-NEXT: s_lshr_b32 s30, s41, 16 +; VI-NEXT: s_lshr_b32 s31, s42, 16 +; VI-NEXT: s_lshr_b32 s34, s43, 16 +; VI-NEXT: s_lshr_b32 s35, s44, 16 +; VI-NEXT: s_lshr_b32 s36, s45, 16 +; VI-NEXT: s_lshr_b32 s37, s46, 16 +; VI-NEXT: s_lshr_b32 s38, s47, 16 +; VI-NEXT: s_lshr_b32 s39, s56, 16 ; VI-NEXT: s_cbranch_execnz .LBB13_3 ; VI-NEXT: .LBB13_2: ; %cmp.true ; VI-NEXT: s_add_i32 s7, s7, 3 @@ -4315,29 +4371,29 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; VI-NEXT: s_add_i32 s13, s13, 3 ; VI-NEXT: s_add_i32 s14, s14, 3 ; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 ; VI-NEXT: s_add_i32 s40, s40, 3 ; VI-NEXT: s_add_i32 s41, s41, 3 ; VI-NEXT: s_add_i32 s42, s42, 3 ; VI-NEXT: s_add_i32 s43, s43, 3 ; VI-NEXT: s_add_i32 s44, s44, 3 ; VI-NEXT: s_add_i32 s45, s45, 3 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_lshr_b32 s46, s7, 16 -; VI-NEXT: s_lshr_b32 s47, s6, 16 -; VI-NEXT: s_lshr_b32 s56, s8, 16 +; VI-NEXT: s_add_i32 s46, s46, 3 +; VI-NEXT: s_add_i32 s47, s47, 3 +; VI-NEXT: s_add_i32 s56, s56, 3 +; VI-NEXT: s_lshr_b32 s27, s7, 16 +; VI-NEXT: s_lshr_b32 s28, s6, 16 +; VI-NEXT: s_lshr_b32 s29, s8, 16 ; VI-NEXT: s_lshr_b32 s57, s9, 16 ; VI-NEXT: s_lshr_b32 s58, s10, 16 ; VI-NEXT: s_lshr_b32 s59, s11, 16 @@ -4345,137 +4401,137 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; VI-NEXT: s_lshr_b32 s61, s13, 16 ; VI-NEXT: s_lshr_b32 s62, s14, 16 ; VI-NEXT: s_lshr_b32 s63, s15, 16 -; VI-NEXT: s_lshr_b32 s72, s40, 16 -; VI-NEXT: s_lshr_b32 s73, s41, 16 -; VI-NEXT: s_lshr_b32 s74, s42, 16 -; VI-NEXT: s_lshr_b32 s75, s43, 16 -; VI-NEXT: s_lshr_b32 s76, s44, 16 -; VI-NEXT: s_lshr_b32 s77, s45, 16 -; VI-NEXT: s_lshr_b32 s78, s29, 16 -; VI-NEXT: s_lshr_b32 s79, s28, 16 -; VI-NEXT: s_lshr_b32 s88, s27, 16 -; VI-NEXT: s_lshr_b32 s89, s26, 16 -; VI-NEXT: s_lshr_b32 s90, s25, 16 -; VI-NEXT: s_lshr_b32 s91, s24, 16 -; VI-NEXT: s_lshr_b32 s30, s23, 16 -; VI-NEXT: s_lshr_b32 s31, s22, 16 -; VI-NEXT: s_lshr_b32 s34, s21, 16 -; VI-NEXT: s_lshr_b32 s35, s20, 16 -; VI-NEXT: s_lshr_b32 s36, s19, 16 -; VI-NEXT: s_lshr_b32 s37, s18, 16 -; VI-NEXT: s_lshr_b32 s38, s17, 16 -; VI-NEXT: s_lshr_b32 s39, s16, 16 +; VI-NEXT: s_lshr_b32 s72, s16, 16 +; VI-NEXT: s_lshr_b32 s73, s17, 16 +; VI-NEXT: s_lshr_b32 s74, s18, 16 +; VI-NEXT: s_lshr_b32 s75, s19, 16 +; VI-NEXT: s_lshr_b32 s76, s20, 16 +; VI-NEXT: s_lshr_b32 s77, s21, 16 +; VI-NEXT: s_lshr_b32 s78, s22, 16 +; VI-NEXT: s_lshr_b32 s79, s23, 16 +; VI-NEXT: s_lshr_b32 s88, s24, 16 +; VI-NEXT: s_lshr_b32 s89, s25, 16 +; VI-NEXT: s_lshr_b32 s90, s26, 16 +; VI-NEXT: s_lshr_b32 s91, s40, 16 +; VI-NEXT: s_lshr_b32 s30, s41, 16 +; VI-NEXT: s_lshr_b32 s31, s42, 16 +; VI-NEXT: s_lshr_b32 s34, s43, 16 +; VI-NEXT: s_lshr_b32 s35, s44, 16 +; VI-NEXT: s_lshr_b32 s36, s45, 16 +; VI-NEXT: s_lshr_b32 s37, s46, 16 +; VI-NEXT: s_lshr_b32 s38, s47, 16 +; VI-NEXT: s_lshr_b32 s39, s56, 16 ; VI-NEXT: .LBB13_3: ; %end -; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_and_b32 s4, 0xffff, s56 ; VI-NEXT: s_lshl_b32 s5, s39, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s16, s38, 16 -; VI-NEXT: s_or_b32 s5, s5, s16 -; VI-NEXT: s_and_b32 s16, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s17, s37, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_and_b32 s17, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s18, s36, 16 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s18, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s19, s35, 16 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_and_b32 s19, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s20, s34, 16 -; VI-NEXT: s_or_b32 s19, s19, s20 -; VI-NEXT: s_and_b32 s20, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s21, s31, 16 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_and_b32 s21, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s22, s30, 16 -; VI-NEXT: s_or_b32 s21, s21, s22 -; VI-NEXT: s_and_b32 s22, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s23, s91, 16 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_and_b32 s23, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s24, s90, 16 -; VI-NEXT: s_or_b32 s23, s23, s24 -; VI-NEXT: s_and_b32 s24, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s25, s89, 16 -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_and_b32 s25, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s26, s88, 16 -; VI-NEXT: s_or_b32 s25, s25, s26 -; VI-NEXT: s_and_b32 s26, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s27, s79, 16 -; VI-NEXT: s_or_b32 s26, s26, s27 -; VI-NEXT: s_and_b32 s27, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s28, s78, 16 -; VI-NEXT: s_or_b32 s27, s27, s28 -; VI-NEXT: s_and_b32 s28, 0xffff, s45 -; VI-NEXT: s_lshl_b32 s29, s77, 16 -; VI-NEXT: s_or_b32 s28, s28, s29 -; VI-NEXT: s_and_b32 s29, 0xffff, s44 -; VI-NEXT: s_lshl_b32 s44, s76, 16 -; VI-NEXT: s_or_b32 s29, s29, s44 +; VI-NEXT: s_and_b32 s5, 0xffff, s47 +; VI-NEXT: s_lshl_b32 s47, s38, 16 +; VI-NEXT: s_or_b32 s5, s5, s47 +; VI-NEXT: s_and_b32 s46, 0xffff, s46 +; VI-NEXT: s_lshl_b32 s47, s37, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s45, 0xffff, s45 +; VI-NEXT: s_lshl_b32 s47, s36, 16 +; VI-NEXT: s_or_b32 s45, s45, s47 +; VI-NEXT: s_and_b32 s44, 0xffff, s44 +; VI-NEXT: s_lshl_b32 s47, s35, 16 +; VI-NEXT: s_or_b32 s44, s44, s47 ; VI-NEXT: s_and_b32 s43, 0xffff, s43 -; VI-NEXT: s_lshl_b32 s44, s75, 16 -; VI-NEXT: s_or_b32 s43, s43, s44 +; VI-NEXT: s_lshl_b32 s47, s34, 16 +; VI-NEXT: s_or_b32 s43, s43, s47 ; VI-NEXT: s_and_b32 s42, 0xffff, s42 -; VI-NEXT: s_lshl_b32 s44, s74, 16 -; VI-NEXT: s_or_b32 s42, s42, s44 +; VI-NEXT: s_lshl_b32 s47, s31, 16 +; VI-NEXT: s_or_b32 s42, s42, s47 ; VI-NEXT: s_and_b32 s41, 0xffff, s41 -; VI-NEXT: s_lshl_b32 s44, s73, 16 -; VI-NEXT: s_or_b32 s41, s41, s44 +; VI-NEXT: s_lshl_b32 s47, s30, 16 +; VI-NEXT: s_or_b32 s41, s41, s47 ; VI-NEXT: s_and_b32 s40, 0xffff, s40 -; VI-NEXT: s_lshl_b32 s44, s72, 16 -; VI-NEXT: s_or_b32 s40, s40, s44 +; VI-NEXT: s_lshl_b32 s47, s91, 16 +; VI-NEXT: s_or_b32 s40, s40, s47 +; VI-NEXT: s_and_b32 s26, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s47, s90, 16 +; VI-NEXT: s_or_b32 s26, s26, s47 +; VI-NEXT: s_and_b32 s25, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s47, s89, 16 +; VI-NEXT: s_or_b32 s25, s25, s47 +; VI-NEXT: s_and_b32 s24, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s47, s88, 16 +; VI-NEXT: s_or_b32 s24, s24, s47 +; VI-NEXT: s_and_b32 s23, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s47, s79, 16 +; VI-NEXT: s_or_b32 s23, s23, s47 +; VI-NEXT: s_and_b32 s22, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s47, s78, 16 +; VI-NEXT: s_or_b32 s22, s22, s47 +; VI-NEXT: s_and_b32 s21, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s47, s77, 16 +; VI-NEXT: s_or_b32 s21, s21, s47 +; VI-NEXT: s_and_b32 s20, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s76, 16 +; VI-NEXT: s_or_b32 s20, s20, s47 +; VI-NEXT: s_and_b32 s19, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s47, s75, 16 +; VI-NEXT: s_or_b32 s19, s19, s47 +; VI-NEXT: s_and_b32 s18, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s47, s74, 16 +; VI-NEXT: s_or_b32 s18, s18, s47 +; VI-NEXT: s_and_b32 s17, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s47, s73, 16 +; VI-NEXT: s_or_b32 s17, s17, s47 +; VI-NEXT: s_and_b32 s16, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s47, s72, 16 +; VI-NEXT: s_or_b32 s16, s16, s47 ; VI-NEXT: s_and_b32 s15, 0xffff, s15 -; VI-NEXT: s_lshl_b32 s44, s63, 16 -; VI-NEXT: s_or_b32 s15, s15, s44 +; VI-NEXT: s_lshl_b32 s47, s63, 16 +; VI-NEXT: s_or_b32 s15, s15, s47 ; VI-NEXT: s_and_b32 s14, 0xffff, s14 -; VI-NEXT: s_lshl_b32 s44, s62, 16 -; VI-NEXT: s_or_b32 s14, s14, s44 +; VI-NEXT: s_lshl_b32 s47, s62, 16 +; VI-NEXT: s_or_b32 s14, s14, s47 ; VI-NEXT: s_and_b32 s13, 0xffff, s13 -; VI-NEXT: s_lshl_b32 s44, s61, 16 -; VI-NEXT: s_or_b32 s13, s13, s44 +; VI-NEXT: s_lshl_b32 s47, s61, 16 +; VI-NEXT: s_or_b32 s13, s13, s47 ; VI-NEXT: s_and_b32 s12, 0xffff, s12 -; VI-NEXT: s_lshl_b32 s44, s60, 16 -; VI-NEXT: s_or_b32 s12, s12, s44 +; VI-NEXT: s_lshl_b32 s47, s60, 16 +; VI-NEXT: s_or_b32 s12, s12, s47 ; VI-NEXT: s_and_b32 s11, 0xffff, s11 -; VI-NEXT: s_lshl_b32 s44, s59, 16 -; VI-NEXT: s_or_b32 s11, s11, s44 +; VI-NEXT: s_lshl_b32 s47, s59, 16 +; VI-NEXT: s_or_b32 s11, s11, s47 ; VI-NEXT: s_and_b32 s10, 0xffff, s10 -; VI-NEXT: s_lshl_b32 s44, s58, 16 -; VI-NEXT: s_or_b32 s10, s10, s44 +; VI-NEXT: s_lshl_b32 s47, s58, 16 +; VI-NEXT: s_or_b32 s10, s10, s47 ; VI-NEXT: s_and_b32 s9, 0xffff, s9 -; VI-NEXT: s_lshl_b32 s44, s57, 16 -; VI-NEXT: s_or_b32 s9, s9, s44 +; VI-NEXT: s_lshl_b32 s47, s57, 16 ; VI-NEXT: s_and_b32 s8, 0xffff, s8 -; VI-NEXT: s_lshl_b32 s44, s56, 16 -; VI-NEXT: s_or_b32 s8, s8, s44 +; VI-NEXT: s_lshl_b32 s29, s29, 16 ; VI-NEXT: s_and_b32 s6, 0xffff, s6 -; VI-NEXT: s_lshl_b32 s44, s47, 16 -; VI-NEXT: s_or_b32 s6, s6, s44 +; VI-NEXT: s_lshl_b32 s28, s28, 16 ; VI-NEXT: s_and_b32 s7, 0xffff, s7 -; VI-NEXT: s_lshl_b32 s44, s46, 16 -; VI-NEXT: s_or_b32 s7, s7, s44 +; VI-NEXT: s_lshl_b32 s27, s27, 16 +; VI-NEXT: s_or_b32 s9, s9, s47 +; VI-NEXT: s_or_b32 s8, s8, s29 +; VI-NEXT: s_or_b32 s6, s6, s28 +; VI-NEXT: s_or_b32 s7, s7, s27 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: v_mov_b32_e32 v3, s17 -; VI-NEXT: v_mov_b32_e32 v4, s18 -; VI-NEXT: v_mov_b32_e32 v5, s19 -; VI-NEXT: v_mov_b32_e32 v6, s20 -; VI-NEXT: v_mov_b32_e32 v7, s21 -; VI-NEXT: v_mov_b32_e32 v8, s22 -; VI-NEXT: v_mov_b32_e32 v9, s23 -; VI-NEXT: v_mov_b32_e32 v10, s24 -; VI-NEXT: v_mov_b32_e32 v11, s25 -; VI-NEXT: v_mov_b32_e32 v12, s26 -; VI-NEXT: v_mov_b32_e32 v13, s27 -; VI-NEXT: v_mov_b32_e32 v14, s28 -; VI-NEXT: v_mov_b32_e32 v15, s29 -; VI-NEXT: v_mov_b32_e32 v16, s43 -; VI-NEXT: v_mov_b32_e32 v17, s42 -; VI-NEXT: v_mov_b32_e32 v18, s41 -; VI-NEXT: v_mov_b32_e32 v19, s40 +; VI-NEXT: v_mov_b32_e32 v2, s46 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s44 +; VI-NEXT: v_mov_b32_e32 v5, s43 +; VI-NEXT: v_mov_b32_e32 v6, s42 +; VI-NEXT: v_mov_b32_e32 v7, s41 +; VI-NEXT: v_mov_b32_e32 v8, s40 +; VI-NEXT: v_mov_b32_e32 v9, s26 +; VI-NEXT: v_mov_b32_e32 v10, s25 +; VI-NEXT: v_mov_b32_e32 v11, s24 +; VI-NEXT: v_mov_b32_e32 v12, s23 +; VI-NEXT: v_mov_b32_e32 v13, s22 +; VI-NEXT: v_mov_b32_e32 v14, s21 +; VI-NEXT: v_mov_b32_e32 v15, s20 +; VI-NEXT: v_mov_b32_e32 v16, s19 +; VI-NEXT: v_mov_b32_e32 v17, s18 +; VI-NEXT: v_mov_b32_e32 v18, s17 +; VI-NEXT: v_mov_b32_e32 v19, s16 ; VI-NEXT: v_mov_b32_e32 v20, s15 ; VI-NEXT: v_mov_b32_e32 v21, s14 ; VI-NEXT: v_mov_b32_e32 v22, s13 @@ -4527,9 +4583,9 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; VI-NEXT: ; implicit-def: $sgpr59 ; VI-NEXT: ; implicit-def: $sgpr58 ; VI-NEXT: ; implicit-def: $sgpr57 -; VI-NEXT: ; implicit-def: $sgpr56 -; VI-NEXT: ; implicit-def: $sgpr47 -; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 ; VI-NEXT: s_branch .LBB13_2 ; ; GFX9-LABEL: bitcast_v30i32_to_v60i16_scalar: @@ -4538,20 +4594,48 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v17, s16 +; GFX9-NEXT: v_mov_b32_e32 v18, s17 +; GFX9-NEXT: v_mov_b32_e32 v19, s18 +; GFX9-NEXT: v_readfirstlane_b32 s6, v17 +; GFX9-NEXT: v_mov_b32_e32 v17, s19 +; GFX9-NEXT: v_readfirstlane_b32 s7, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, s20 +; GFX9-NEXT: v_readfirstlane_b32 s8, v19 +; GFX9-NEXT: v_mov_b32_e32 v19, s21 +; GFX9-NEXT: v_readfirstlane_b32 s9, v17 +; GFX9-NEXT: v_mov_b32_e32 v17, s22 +; GFX9-NEXT: v_readfirstlane_b32 s10, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, s23 ; GFX9-NEXT: v_writelane_b32 v30, s30, 0 +; GFX9-NEXT: v_readfirstlane_b32 s11, v19 +; GFX9-NEXT: v_mov_b32_e32 v19, s24 +; GFX9-NEXT: v_readfirstlane_b32 s12, v17 +; GFX9-NEXT: v_mov_b32_e32 v17, s25 +; GFX9-NEXT: v_readfirstlane_b32 s13, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, s26 ; GFX9-NEXT: v_writelane_b32 v30, s31, 1 +; GFX9-NEXT: v_readfirstlane_b32 s14, v19 +; GFX9-NEXT: v_mov_b32_e32 v19, s27 +; GFX9-NEXT: v_readfirstlane_b32 s15, v17 +; GFX9-NEXT: v_mov_b32_e32 v17, s28 +; GFX9-NEXT: v_readfirstlane_b32 s16, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; GFX9-NEXT: v_writelane_b32 v30, s34, 2 -; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: v_readfirstlane_b32 s7, v1 -; GFX9-NEXT: v_readfirstlane_b32 s8, v2 -; GFX9-NEXT: v_readfirstlane_b32 s9, v3 -; GFX9-NEXT: v_readfirstlane_b32 s10, v4 -; GFX9-NEXT: v_readfirstlane_b32 s11, v5 -; GFX9-NEXT: v_readfirstlane_b32 s12, v6 -; GFX9-NEXT: v_readfirstlane_b32 s13, v7 -; GFX9-NEXT: v_readfirstlane_b32 s14, v8 -; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s17, v19 +; GFX9-NEXT: v_readfirstlane_b32 s18, v17 +; GFX9-NEXT: v_readfirstlane_b32 s19, v18 +; GFX9-NEXT: v_readfirstlane_b32 s20, v0 +; GFX9-NEXT: v_readfirstlane_b32 s21, v1 +; GFX9-NEXT: v_readfirstlane_b32 s22, v2 +; GFX9-NEXT: v_readfirstlane_b32 s23, v3 +; GFX9-NEXT: v_readfirstlane_b32 s24, v4 +; GFX9-NEXT: v_readfirstlane_b32 s25, v5 +; GFX9-NEXT: v_readfirstlane_b32 s26, v6 +; GFX9-NEXT: v_readfirstlane_b32 s27, v7 +; GFX9-NEXT: v_readfirstlane_b32 s28, v8 +; GFX9-NEXT: v_readfirstlane_b32 s29, v9 ; GFX9-NEXT: v_readfirstlane_b32 s40, v10 ; GFX9-NEXT: v_readfirstlane_b32 s41, v11 ; GFX9-NEXT: v_readfirstlane_b32 s42, v12 @@ -4568,30 +4652,30 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; GFX9-NEXT: s_lshr_b32 s57, s42, 16 ; GFX9-NEXT: s_lshr_b32 s58, s41, 16 ; GFX9-NEXT: s_lshr_b32 s59, s40, 16 -; GFX9-NEXT: s_lshr_b32 s60, s15, 16 -; GFX9-NEXT: s_lshr_b32 s61, s14, 16 -; GFX9-NEXT: s_lshr_b32 s62, s13, 16 -; GFX9-NEXT: s_lshr_b32 s63, s12, 16 -; GFX9-NEXT: s_lshr_b32 s72, s11, 16 -; GFX9-NEXT: s_lshr_b32 s73, s10, 16 -; GFX9-NEXT: s_lshr_b32 s74, s9, 16 -; GFX9-NEXT: s_lshr_b32 s75, s8, 16 -; GFX9-NEXT: s_lshr_b32 s76, s7, 16 -; GFX9-NEXT: s_lshr_b32 s77, s6, 16 -; GFX9-NEXT: s_lshr_b32 s78, s29, 16 -; GFX9-NEXT: s_lshr_b32 s79, s28, 16 -; GFX9-NEXT: s_lshr_b32 s88, s27, 16 -; GFX9-NEXT: s_lshr_b32 s89, s26, 16 -; GFX9-NEXT: s_lshr_b32 s90, s25, 16 -; GFX9-NEXT: s_lshr_b32 s91, s24, 16 -; GFX9-NEXT: s_lshr_b32 s92, s23, 16 -; GFX9-NEXT: s_lshr_b32 s93, s22, 16 -; GFX9-NEXT: s_lshr_b32 s94, s21, 16 -; GFX9-NEXT: s_lshr_b32 s95, s20, 16 -; GFX9-NEXT: s_lshr_b32 s30, s19, 16 -; GFX9-NEXT: s_lshr_b32 s31, s18, 16 -; GFX9-NEXT: s_lshr_b32 s34, s17, 16 -; GFX9-NEXT: s_lshr_b32 s35, s16, 16 +; GFX9-NEXT: s_lshr_b32 s60, s29, 16 +; GFX9-NEXT: s_lshr_b32 s61, s28, 16 +; GFX9-NEXT: s_lshr_b32 s62, s27, 16 +; GFX9-NEXT: s_lshr_b32 s63, s26, 16 +; GFX9-NEXT: s_lshr_b32 s72, s25, 16 +; GFX9-NEXT: s_lshr_b32 s73, s24, 16 +; GFX9-NEXT: s_lshr_b32 s74, s23, 16 +; GFX9-NEXT: s_lshr_b32 s75, s22, 16 +; GFX9-NEXT: s_lshr_b32 s76, s21, 16 +; GFX9-NEXT: s_lshr_b32 s77, s20, 16 +; GFX9-NEXT: s_lshr_b32 s78, s19, 16 +; GFX9-NEXT: s_lshr_b32 s79, s18, 16 +; GFX9-NEXT: s_lshr_b32 s88, s17, 16 +; GFX9-NEXT: s_lshr_b32 s89, s16, 16 +; GFX9-NEXT: s_lshr_b32 s90, s15, 16 +; GFX9-NEXT: s_lshr_b32 s91, s14, 16 +; GFX9-NEXT: s_lshr_b32 s92, s13, 16 +; GFX9-NEXT: s_lshr_b32 s93, s12, 16 +; GFX9-NEXT: s_lshr_b32 s94, s11, 16 +; GFX9-NEXT: s_lshr_b32 s95, s10, 16 +; GFX9-NEXT: s_lshr_b32 s30, s9, 16 +; GFX9-NEXT: s_lshr_b32 s31, s8, 16 +; GFX9-NEXT: s_lshr_b32 s34, s7, 16 +; GFX9-NEXT: s_lshr_b32 s35, s6, 16 ; GFX9-NEXT: s_cbranch_execnz .LBB13_3 ; GFX9-NEXT: .LBB13_2: ; %cmp.true ; GFX9-NEXT: s_add_i32 s45, s45, 3 @@ -4600,16 +4684,6 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; GFX9-NEXT: s_add_i32 s42, s42, 3 ; GFX9-NEXT: s_add_i32 s41, s41, 3 ; GFX9-NEXT: s_add_i32 s40, s40, 3 -; GFX9-NEXT: s_add_i32 s15, s15, 3 -; GFX9-NEXT: s_add_i32 s14, s14, 3 -; GFX9-NEXT: s_add_i32 s13, s13, 3 -; GFX9-NEXT: s_add_i32 s12, s12, 3 -; GFX9-NEXT: s_add_i32 s11, s11, 3 -; GFX9-NEXT: s_add_i32 s10, s10, 3 -; GFX9-NEXT: s_add_i32 s9, s9, 3 -; GFX9-NEXT: s_add_i32 s8, s8, 3 -; GFX9-NEXT: s_add_i32 s7, s7, 3 -; GFX9-NEXT: s_add_i32 s6, s6, 3 ; GFX9-NEXT: s_add_i32 s29, s29, 3 ; GFX9-NEXT: s_add_i32 s28, s28, 3 ; GFX9-NEXT: s_add_i32 s27, s27, 3 @@ -4624,61 +4698,71 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; GFX9-NEXT: s_add_i32 s18, s18, 3 ; GFX9-NEXT: s_add_i32 s17, s17, 3 ; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s15, s15, 3 +; GFX9-NEXT: s_add_i32 s14, s14, 3 +; GFX9-NEXT: s_add_i32 s13, s13, 3 +; GFX9-NEXT: s_add_i32 s12, s12, 3 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_add_i32 s10, s10, 3 +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: s_add_i32 s6, s6, 3 ; GFX9-NEXT: s_lshr_b32 s46, s45, 16 ; GFX9-NEXT: s_lshr_b32 s47, s44, 16 ; GFX9-NEXT: s_lshr_b32 s56, s43, 16 ; GFX9-NEXT: s_lshr_b32 s57, s42, 16 ; GFX9-NEXT: s_lshr_b32 s58, s41, 16 ; GFX9-NEXT: s_lshr_b32 s59, s40, 16 -; GFX9-NEXT: s_lshr_b32 s60, s15, 16 -; GFX9-NEXT: s_lshr_b32 s61, s14, 16 -; GFX9-NEXT: s_lshr_b32 s62, s13, 16 -; GFX9-NEXT: s_lshr_b32 s63, s12, 16 -; GFX9-NEXT: s_lshr_b32 s72, s11, 16 -; GFX9-NEXT: s_lshr_b32 s73, s10, 16 -; GFX9-NEXT: s_lshr_b32 s74, s9, 16 -; GFX9-NEXT: s_lshr_b32 s75, s8, 16 -; GFX9-NEXT: s_lshr_b32 s76, s7, 16 -; GFX9-NEXT: s_lshr_b32 s77, s6, 16 -; GFX9-NEXT: s_lshr_b32 s78, s29, 16 -; GFX9-NEXT: s_lshr_b32 s79, s28, 16 -; GFX9-NEXT: s_lshr_b32 s88, s27, 16 -; GFX9-NEXT: s_lshr_b32 s89, s26, 16 -; GFX9-NEXT: s_lshr_b32 s90, s25, 16 -; GFX9-NEXT: s_lshr_b32 s91, s24, 16 -; GFX9-NEXT: s_lshr_b32 s92, s23, 16 -; GFX9-NEXT: s_lshr_b32 s93, s22, 16 -; GFX9-NEXT: s_lshr_b32 s94, s21, 16 -; GFX9-NEXT: s_lshr_b32 s95, s20, 16 -; GFX9-NEXT: s_lshr_b32 s30, s19, 16 -; GFX9-NEXT: s_lshr_b32 s31, s18, 16 -; GFX9-NEXT: s_lshr_b32 s34, s17, 16 -; GFX9-NEXT: s_lshr_b32 s35, s16, 16 +; GFX9-NEXT: s_lshr_b32 s60, s29, 16 +; GFX9-NEXT: s_lshr_b32 s61, s28, 16 +; GFX9-NEXT: s_lshr_b32 s62, s27, 16 +; GFX9-NEXT: s_lshr_b32 s63, s26, 16 +; GFX9-NEXT: s_lshr_b32 s72, s25, 16 +; GFX9-NEXT: s_lshr_b32 s73, s24, 16 +; GFX9-NEXT: s_lshr_b32 s74, s23, 16 +; GFX9-NEXT: s_lshr_b32 s75, s22, 16 +; GFX9-NEXT: s_lshr_b32 s76, s21, 16 +; GFX9-NEXT: s_lshr_b32 s77, s20, 16 +; GFX9-NEXT: s_lshr_b32 s78, s19, 16 +; GFX9-NEXT: s_lshr_b32 s79, s18, 16 +; GFX9-NEXT: s_lshr_b32 s88, s17, 16 +; GFX9-NEXT: s_lshr_b32 s89, s16, 16 +; GFX9-NEXT: s_lshr_b32 s90, s15, 16 +; GFX9-NEXT: s_lshr_b32 s91, s14, 16 +; GFX9-NEXT: s_lshr_b32 s92, s13, 16 +; GFX9-NEXT: s_lshr_b32 s93, s12, 16 +; GFX9-NEXT: s_lshr_b32 s94, s11, 16 +; GFX9-NEXT: s_lshr_b32 s95, s10, 16 +; GFX9-NEXT: s_lshr_b32 s30, s9, 16 +; GFX9-NEXT: s_lshr_b32 s31, s8, 16 +; GFX9-NEXT: s_lshr_b32 s34, s7, 16 +; GFX9-NEXT: s_lshr_b32 s35, s6, 16 ; GFX9-NEXT: .LBB13_3: ; %end -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s35 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s34 -; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s31 -; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s30 -; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s95 -; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s94 -; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s93 -; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s92 -; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s91 -; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s90 -; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s89 -; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s88 -; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s79 -; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s78 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s77 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s76 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s75 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s74 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s73 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s72 -; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s63 -; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s62 -; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s61 -; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s35 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s7, s34 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s31 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s9, s30 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s10, s95 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s11, s94 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s12, s93 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s13, s92 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s14, s91 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s15, s90 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s16, s89 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s17, s88 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s60 ; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s59 ; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s58 ; GFX9-NEXT: s_pack_ll_b32_b16 s40, s42, s57 @@ -4687,28 +4771,28 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; GFX9-NEXT: s_pack_ll_b32_b16 s43, s45, s46 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s18 -; GFX9-NEXT: v_mov_b32_e32 v5, s19 -; GFX9-NEXT: v_mov_b32_e32 v6, s20 -; GFX9-NEXT: v_mov_b32_e32 v7, s21 -; GFX9-NEXT: v_mov_b32_e32 v8, s22 -; GFX9-NEXT: v_mov_b32_e32 v9, s23 -; GFX9-NEXT: v_mov_b32_e32 v10, s24 -; GFX9-NEXT: v_mov_b32_e32 v11, s25 -; GFX9-NEXT: v_mov_b32_e32 v12, s26 -; GFX9-NEXT: v_mov_b32_e32 v13, s27 -; GFX9-NEXT: v_mov_b32_e32 v14, s6 -; GFX9-NEXT: v_mov_b32_e32 v15, s7 -; GFX9-NEXT: v_mov_b32_e32 v16, s8 -; GFX9-NEXT: v_mov_b32_e32 v17, s9 -; GFX9-NEXT: v_mov_b32_e32 v18, s10 -; GFX9-NEXT: v_mov_b32_e32 v19, s11 -; GFX9-NEXT: v_mov_b32_e32 v20, s12 -; GFX9-NEXT: v_mov_b32_e32 v21, s13 -; GFX9-NEXT: v_mov_b32_e32 v22, s14 -; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-NEXT: v_mov_b32_e32 v12, s16 +; GFX9-NEXT: v_mov_b32_e32 v13, s17 +; GFX9-NEXT: v_mov_b32_e32 v14, s18 +; GFX9-NEXT: v_mov_b32_e32 v15, s19 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v18, s22 +; GFX9-NEXT: v_mov_b32_e32 v19, s23 +; GFX9-NEXT: v_mov_b32_e32 v20, s24 +; GFX9-NEXT: v_mov_b32_e32 v21, s25 +; GFX9-NEXT: v_mov_b32_e32 v22, s26 +; GFX9-NEXT: v_mov_b32_e32 v23, s27 ; GFX9-NEXT: v_mov_b32_e32 v24, s28 ; GFX9-NEXT: v_mov_b32_e32 v25, s29 ; GFX9-NEXT: v_mov_b32_e32 v26, s40 @@ -4760,49 +4844,76 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; GFX11-LABEL: bitcast_v30i32_to_v60i16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v13, s0 :: v_dual_mov_b32 v14, s1 +; GFX11-NEXT: v_dual_mov_b32 v15, s2 :: v_dual_mov_b32 v16, s3 +; GFX11-NEXT: v_dual_mov_b32 v17, s16 :: v_dual_mov_b32 v18, s17 +; GFX11-NEXT: v_dual_mov_b32 v19, s18 :: v_dual_mov_b32 v20, s19 +; GFX11-NEXT: v_dual_mov_b32 v21, s20 :: v_dual_mov_b32 v22, s21 +; GFX11-NEXT: v_dual_mov_b32 v23, s22 :: v_dual_mov_b32 v24, s23 +; GFX11-NEXT: v_dual_mov_b32 v25, s24 :: v_dual_mov_b32 v26, s25 +; GFX11-NEXT: v_dual_mov_b32 v27, s26 :: v_dual_mov_b32 v28, s27 +; GFX11-NEXT: v_dual_mov_b32 v29, s28 :: v_dual_mov_b32 v30, s29 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: v_readfirstlane_b32 s8, v4 -; GFX11-NEXT: v_readfirstlane_b32 s9, v5 -; GFX11-NEXT: v_readfirstlane_b32 s10, v6 -; GFX11-NEXT: v_readfirstlane_b32 s11, v7 -; GFX11-NEXT: v_readfirstlane_b32 s12, v8 -; GFX11-NEXT: v_readfirstlane_b32 s13, v9 -; GFX11-NEXT: v_readfirstlane_b32 s15, v10 -; GFX11-NEXT: v_readfirstlane_b32 s14, v11 +; GFX11-NEXT: v_readfirstlane_b32 s0, v13 +; GFX11-NEXT: v_readfirstlane_b32 s1, v14 +; GFX11-NEXT: v_readfirstlane_b32 s2, v15 +; GFX11-NEXT: v_readfirstlane_b32 s3, v16 +; GFX11-NEXT: v_readfirstlane_b32 s4, v17 +; GFX11-NEXT: v_readfirstlane_b32 s5, v18 +; GFX11-NEXT: v_readfirstlane_b32 s6, v19 +; GFX11-NEXT: v_readfirstlane_b32 s7, v20 +; GFX11-NEXT: v_readfirstlane_b32 s8, v21 +; GFX11-NEXT: v_readfirstlane_b32 s9, v22 +; GFX11-NEXT: v_readfirstlane_b32 s10, v23 +; GFX11-NEXT: v_readfirstlane_b32 s11, v24 +; GFX11-NEXT: v_readfirstlane_b32 s12, v25 +; GFX11-NEXT: v_readfirstlane_b32 s13, v26 +; GFX11-NEXT: v_readfirstlane_b32 s14, v27 +; GFX11-NEXT: v_readfirstlane_b32 s15, v28 +; GFX11-NEXT: v_readfirstlane_b32 s16, v29 +; GFX11-NEXT: v_readfirstlane_b32 s17, v30 +; GFX11-NEXT: v_readfirstlane_b32 s18, v0 +; GFX11-NEXT: v_readfirstlane_b32 s19, v1 +; GFX11-NEXT: v_readfirstlane_b32 s20, v2 +; GFX11-NEXT: v_readfirstlane_b32 s21, v3 +; GFX11-NEXT: v_readfirstlane_b32 s22, v4 +; GFX11-NEXT: v_readfirstlane_b32 s23, v5 +; GFX11-NEXT: v_readfirstlane_b32 s24, v6 +; GFX11-NEXT: v_readfirstlane_b32 s25, v7 +; GFX11-NEXT: v_readfirstlane_b32 s26, v8 +; GFX11-NEXT: v_readfirstlane_b32 s27, v9 +; GFX11-NEXT: v_readfirstlane_b32 s29, v10 +; GFX11-NEXT: v_readfirstlane_b32 s28, v11 ; GFX11-NEXT: s_mov_b32 s94, 0 ; GFX11-NEXT: s_and_b32 s40, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB13_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: s_lshr_b32 s40, s14, 16 -; GFX11-NEXT: s_lshr_b32 s41, s15, 16 -; GFX11-NEXT: s_lshr_b32 s42, s13, 16 -; GFX11-NEXT: s_lshr_b32 s43, s12, 16 -; GFX11-NEXT: s_lshr_b32 s44, s11, 16 -; GFX11-NEXT: s_lshr_b32 s45, s10, 16 -; GFX11-NEXT: s_lshr_b32 s46, s9, 16 -; GFX11-NEXT: s_lshr_b32 s47, s8, 16 -; GFX11-NEXT: s_lshr_b32 s56, s7, 16 -; GFX11-NEXT: s_lshr_b32 s57, s6, 16 -; GFX11-NEXT: s_lshr_b32 s58, s5, 16 -; GFX11-NEXT: s_lshr_b32 s59, s4, 16 -; GFX11-NEXT: s_lshr_b32 s60, s29, 16 -; GFX11-NEXT: s_lshr_b32 s61, s28, 16 -; GFX11-NEXT: s_lshr_b32 s62, s27, 16 -; GFX11-NEXT: s_lshr_b32 s63, s26, 16 -; GFX11-NEXT: s_lshr_b32 s72, s25, 16 -; GFX11-NEXT: s_lshr_b32 s73, s24, 16 -; GFX11-NEXT: s_lshr_b32 s74, s23, 16 -; GFX11-NEXT: s_lshr_b32 s75, s22, 16 -; GFX11-NEXT: s_lshr_b32 s76, s21, 16 -; GFX11-NEXT: s_lshr_b32 s77, s20, 16 -; GFX11-NEXT: s_lshr_b32 s78, s19, 16 -; GFX11-NEXT: s_lshr_b32 s79, s18, 16 -; GFX11-NEXT: s_lshr_b32 s88, s17, 16 -; GFX11-NEXT: s_lshr_b32 s89, s16, 16 +; GFX11-NEXT: s_lshr_b32 s40, s28, 16 +; GFX11-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-NEXT: s_lshr_b32 s44, s25, 16 +; GFX11-NEXT: s_lshr_b32 s45, s24, 16 +; GFX11-NEXT: s_lshr_b32 s46, s23, 16 +; GFX11-NEXT: s_lshr_b32 s47, s22, 16 +; GFX11-NEXT: s_lshr_b32 s56, s21, 16 +; GFX11-NEXT: s_lshr_b32 s57, s20, 16 +; GFX11-NEXT: s_lshr_b32 s58, s19, 16 +; GFX11-NEXT: s_lshr_b32 s59, s18, 16 +; GFX11-NEXT: s_lshr_b32 s60, s17, 16 +; GFX11-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-NEXT: s_lshr_b32 s62, s15, 16 +; GFX11-NEXT: s_lshr_b32 s63, s14, 16 +; GFX11-NEXT: s_lshr_b32 s72, s13, 16 +; GFX11-NEXT: s_lshr_b32 s73, s12, 16 +; GFX11-NEXT: s_lshr_b32 s74, s11, 16 +; GFX11-NEXT: s_lshr_b32 s75, s10, 16 +; GFX11-NEXT: s_lshr_b32 s76, s9, 16 +; GFX11-NEXT: s_lshr_b32 s77, s8, 16 +; GFX11-NEXT: s_lshr_b32 s78, s7, 16 +; GFX11-NEXT: s_lshr_b32 s79, s6, 16 +; GFX11-NEXT: s_lshr_b32 s88, s5, 16 +; GFX11-NEXT: s_lshr_b32 s89, s4, 16 ; GFX11-NEXT: s_lshr_b32 s90, s3, 16 ; GFX11-NEXT: s_lshr_b32 s91, s2, 16 ; GFX11-NEXT: s_lshr_b32 s92, s1, 16 @@ -4810,20 +4921,8 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s94 ; GFX11-NEXT: s_cbranch_vccnz .LBB13_3 ; GFX11-NEXT: .LBB13_2: ; %cmp.true -; GFX11-NEXT: s_add_i32 s14, s14, 3 -; GFX11-NEXT: s_add_i32 s15, s15, 3 -; GFX11-NEXT: s_add_i32 s13, s13, 3 -; GFX11-NEXT: s_add_i32 s12, s12, 3 -; GFX11-NEXT: s_add_i32 s11, s11, 3 -; GFX11-NEXT: s_add_i32 s10, s10, 3 -; GFX11-NEXT: s_add_i32 s9, s9, 3 -; GFX11-NEXT: s_add_i32 s8, s8, 3 -; GFX11-NEXT: s_add_i32 s7, s7, 3 -; GFX11-NEXT: s_add_i32 s6, s6, 3 -; GFX11-NEXT: s_add_i32 s5, s5, 3 -; GFX11-NEXT: s_add_i32 s4, s4, 3 -; GFX11-NEXT: s_add_i32 s29, s29, 3 ; GFX11-NEXT: s_add_i32 s28, s28, 3 +; GFX11-NEXT: s_add_i32 s29, s29, 3 ; GFX11-NEXT: s_add_i32 s27, s27, 3 ; GFX11-NEXT: s_add_i32 s26, s26, 3 ; GFX11-NEXT: s_add_i32 s25, s25, 3 @@ -4836,36 +4935,48 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; GFX11-NEXT: s_add_i32 s18, s18, 3 ; GFX11-NEXT: s_add_i32 s17, s17, 3 ; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s15, s15, 3 +; GFX11-NEXT: s_add_i32 s14, s14, 3 +; GFX11-NEXT: s_add_i32 s13, s13, 3 +; GFX11-NEXT: s_add_i32 s12, s12, 3 +; GFX11-NEXT: s_add_i32 s11, s11, 3 +; GFX11-NEXT: s_add_i32 s10, s10, 3 +; GFX11-NEXT: s_add_i32 s9, s9, 3 +; GFX11-NEXT: s_add_i32 s8, s8, 3 +; GFX11-NEXT: s_add_i32 s7, s7, 3 +; GFX11-NEXT: s_add_i32 s6, s6, 3 +; GFX11-NEXT: s_add_i32 s5, s5, 3 +; GFX11-NEXT: s_add_i32 s4, s4, 3 ; GFX11-NEXT: s_add_i32 s3, s3, 3 ; GFX11-NEXT: s_add_i32 s2, s2, 3 ; GFX11-NEXT: s_add_i32 s1, s1, 3 ; GFX11-NEXT: s_add_i32 s0, s0, 3 -; GFX11-NEXT: s_lshr_b32 s40, s14, 16 -; GFX11-NEXT: s_lshr_b32 s41, s15, 16 -; GFX11-NEXT: s_lshr_b32 s42, s13, 16 -; GFX11-NEXT: s_lshr_b32 s43, s12, 16 -; GFX11-NEXT: s_lshr_b32 s44, s11, 16 -; GFX11-NEXT: s_lshr_b32 s45, s10, 16 -; GFX11-NEXT: s_lshr_b32 s46, s9, 16 -; GFX11-NEXT: s_lshr_b32 s47, s8, 16 -; GFX11-NEXT: s_lshr_b32 s56, s7, 16 -; GFX11-NEXT: s_lshr_b32 s57, s6, 16 -; GFX11-NEXT: s_lshr_b32 s58, s5, 16 -; GFX11-NEXT: s_lshr_b32 s59, s4, 16 -; GFX11-NEXT: s_lshr_b32 s60, s29, 16 -; GFX11-NEXT: s_lshr_b32 s61, s28, 16 -; GFX11-NEXT: s_lshr_b32 s62, s27, 16 -; GFX11-NEXT: s_lshr_b32 s63, s26, 16 -; GFX11-NEXT: s_lshr_b32 s72, s25, 16 -; GFX11-NEXT: s_lshr_b32 s73, s24, 16 -; GFX11-NEXT: s_lshr_b32 s74, s23, 16 -; GFX11-NEXT: s_lshr_b32 s75, s22, 16 -; GFX11-NEXT: s_lshr_b32 s76, s21, 16 -; GFX11-NEXT: s_lshr_b32 s77, s20, 16 -; GFX11-NEXT: s_lshr_b32 s78, s19, 16 -; GFX11-NEXT: s_lshr_b32 s79, s18, 16 -; GFX11-NEXT: s_lshr_b32 s88, s17, 16 -; GFX11-NEXT: s_lshr_b32 s89, s16, 16 +; GFX11-NEXT: s_lshr_b32 s40, s28, 16 +; GFX11-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-NEXT: s_lshr_b32 s44, s25, 16 +; GFX11-NEXT: s_lshr_b32 s45, s24, 16 +; GFX11-NEXT: s_lshr_b32 s46, s23, 16 +; GFX11-NEXT: s_lshr_b32 s47, s22, 16 +; GFX11-NEXT: s_lshr_b32 s56, s21, 16 +; GFX11-NEXT: s_lshr_b32 s57, s20, 16 +; GFX11-NEXT: s_lshr_b32 s58, s19, 16 +; GFX11-NEXT: s_lshr_b32 s59, s18, 16 +; GFX11-NEXT: s_lshr_b32 s60, s17, 16 +; GFX11-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-NEXT: s_lshr_b32 s62, s15, 16 +; GFX11-NEXT: s_lshr_b32 s63, s14, 16 +; GFX11-NEXT: s_lshr_b32 s72, s13, 16 +; GFX11-NEXT: s_lshr_b32 s73, s12, 16 +; GFX11-NEXT: s_lshr_b32 s74, s11, 16 +; GFX11-NEXT: s_lshr_b32 s75, s10, 16 +; GFX11-NEXT: s_lshr_b32 s76, s9, 16 +; GFX11-NEXT: s_lshr_b32 s77, s8, 16 +; GFX11-NEXT: s_lshr_b32 s78, s7, 16 +; GFX11-NEXT: s_lshr_b32 s79, s6, 16 +; GFX11-NEXT: s_lshr_b32 s88, s5, 16 +; GFX11-NEXT: s_lshr_b32 s89, s4, 16 ; GFX11-NEXT: s_lshr_b32 s90, s3, 16 ; GFX11-NEXT: s_lshr_b32 s91, s2, 16 ; GFX11-NEXT: s_lshr_b32 s92, s1, 16 @@ -4876,47 +4987,47 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s92 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s91 ; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s90 -; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s89 -; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s88 -; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s79 -; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s78 -; GFX11-NEXT: s_pack_ll_b32_b16 s20, s20, s77 -; GFX11-NEXT: s_pack_ll_b32_b16 s21, s21, s76 -; GFX11-NEXT: s_pack_ll_b32_b16 s22, s22, s75 -; GFX11-NEXT: s_pack_ll_b32_b16 s23, s23, s74 -; GFX11-NEXT: s_pack_ll_b32_b16 s24, s24, s73 -; GFX11-NEXT: s_pack_ll_b32_b16 s25, s25, s72 -; GFX11-NEXT: s_pack_ll_b32_b16 s26, s26, s63 -; GFX11-NEXT: s_pack_ll_b32_b16 s27, s27, s62 -; GFX11-NEXT: s_pack_ll_b32_b16 s28, s28, s61 -; GFX11-NEXT: s_pack_ll_b32_b16 s29, s29, s60 -; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s59 -; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s58 -; GFX11-NEXT: s_pack_ll_b32_b16 s6, s6, s57 -; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s56 -; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s47 -; GFX11-NEXT: s_pack_ll_b32_b16 s9, s9, s46 -; GFX11-NEXT: s_pack_ll_b32_b16 s10, s10, s45 -; GFX11-NEXT: s_pack_ll_b32_b16 s11, s11, s44 -; GFX11-NEXT: s_pack_ll_b32_b16 s12, s12, s43 -; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s42 -; GFX11-NEXT: s_pack_ll_b32_b16 s15, s15, s41 -; GFX11-NEXT: s_pack_ll_b32_b16 s14, s14, s40 +; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s89 +; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s88 +; GFX11-NEXT: s_pack_ll_b32_b16 s6, s6, s79 +; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s78 +; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s77 +; GFX11-NEXT: s_pack_ll_b32_b16 s9, s9, s76 +; GFX11-NEXT: s_pack_ll_b32_b16 s10, s10, s75 +; GFX11-NEXT: s_pack_ll_b32_b16 s11, s11, s74 +; GFX11-NEXT: s_pack_ll_b32_b16 s12, s12, s73 +; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s72 +; GFX11-NEXT: s_pack_ll_b32_b16 s14, s14, s63 +; GFX11-NEXT: s_pack_ll_b32_b16 s15, s15, s62 +; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s61 +; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s60 +; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s59 +; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s58 +; GFX11-NEXT: s_pack_ll_b32_b16 s20, s20, s57 +; GFX11-NEXT: s_pack_ll_b32_b16 s21, s21, s56 +; GFX11-NEXT: s_pack_ll_b32_b16 s22, s22, s47 +; GFX11-NEXT: s_pack_ll_b32_b16 s23, s23, s46 +; GFX11-NEXT: s_pack_ll_b32_b16 s24, s24, s45 +; GFX11-NEXT: s_pack_ll_b32_b16 s25, s25, s44 +; GFX11-NEXT: s_pack_ll_b32_b16 s26, s26, s43 +; GFX11-NEXT: s_pack_ll_b32_b16 s27, s27, s42 +; GFX11-NEXT: s_pack_ll_b32_b16 s29, s29, s41 +; GFX11-NEXT: s_pack_ll_b32_b16 s28, s28, s40 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 -; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 -; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 -; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 -; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 -; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 -; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 -; GFX11-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 -; GFX11-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 -; GFX11-NEXT: v_dual_mov_b32 v22, s8 :: v_dual_mov_b32 v23, s9 -; GFX11-NEXT: v_dual_mov_b32 v24, s10 :: v_dual_mov_b32 v25, s11 -; GFX11-NEXT: v_dual_mov_b32 v26, s12 :: v_dual_mov_b32 v27, s13 -; GFX11-NEXT: v_dual_mov_b32 v28, s15 :: v_dual_mov_b32 v29, s14 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GFX11-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v21, s21 +; GFX11-NEXT: v_dual_mov_b32 v22, s22 :: v_dual_mov_b32 v23, s23 +; GFX11-NEXT: v_dual_mov_b32 v24, s24 :: v_dual_mov_b32 v25, s25 +; GFX11-NEXT: v_dual_mov_b32 v26, s26 :: v_dual_mov_b32 v27, s27 +; GFX11-NEXT: v_dual_mov_b32 v28, s29 :: v_dual_mov_b32 v29, s28 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB13_4: ; GFX11-NEXT: ; implicit-def: $sgpr93 @@ -5467,7 +5578,7 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 ; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 ; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 -; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v27 ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v28 ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 ; SI-NEXT: .LBB14_4: ; %end @@ -9050,13 +9161,41 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; SI-LABEL: bitcast_v30i32_to_v60f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v18, s16 +; SI-NEXT: v_mov_b32_e32 v19, s17 +; SI-NEXT: v_readfirstlane_b32 s40, v18 +; SI-NEXT: v_mov_b32_e32 v18, s18 +; SI-NEXT: v_readfirstlane_b32 s41, v19 +; SI-NEXT: v_mov_b32_e32 v19, s19 +; SI-NEXT: v_readfirstlane_b32 s42, v18 +; SI-NEXT: v_mov_b32_e32 v18, s20 +; SI-NEXT: v_readfirstlane_b32 s43, v19 +; SI-NEXT: v_mov_b32_e32 v19, s21 +; SI-NEXT: v_readfirstlane_b32 s44, v18 +; SI-NEXT: v_mov_b32_e32 v18, s22 +; SI-NEXT: v_readfirstlane_b32 s45, v19 +; SI-NEXT: v_mov_b32_e32 v19, s23 +; SI-NEXT: v_readfirstlane_b32 s23, v18 +; SI-NEXT: v_mov_b32_e32 v18, s24 +; SI-NEXT: v_readfirstlane_b32 s24, v19 +; SI-NEXT: v_mov_b32_e32 v19, s25 +; SI-NEXT: v_readfirstlane_b32 s25, v18 +; SI-NEXT: v_mov_b32_e32 v18, s26 +; SI-NEXT: v_readfirstlane_b32 s26, v19 +; SI-NEXT: v_mov_b32_e32 v19, s27 +; SI-NEXT: v_readfirstlane_b32 s27, v18 +; SI-NEXT: v_mov_b32_e32 v18, s28 +; SI-NEXT: v_readfirstlane_b32 s28, v19 +; SI-NEXT: v_mov_b32_e32 v19, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; SI-NEXT: v_readfirstlane_b32 s45, v1 -; SI-NEXT: v_readfirstlane_b32 s44, v2 -; SI-NEXT: v_readfirstlane_b32 s43, v3 -; SI-NEXT: v_readfirstlane_b32 s42, v4 -; SI-NEXT: v_readfirstlane_b32 s41, v5 -; SI-NEXT: v_readfirstlane_b32 s40, v6 +; SI-NEXT: v_readfirstlane_b32 s29, v18 +; SI-NEXT: v_readfirstlane_b32 s22, v19 +; SI-NEXT: v_readfirstlane_b32 s21, v1 +; SI-NEXT: v_readfirstlane_b32 s20, v2 +; SI-NEXT: v_readfirstlane_b32 s19, v3 +; SI-NEXT: v_readfirstlane_b32 s18, v4 +; SI-NEXT: v_readfirstlane_b32 s17, v5 +; SI-NEXT: v_readfirstlane_b32 s16, v6 ; SI-NEXT: v_readfirstlane_b32 s15, v7 ; SI-NEXT: v_readfirstlane_b32 s14, v8 ; SI-NEXT: v_readfirstlane_b32 s13, v9 @@ -9103,48 +9242,48 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 ; SI-NEXT: s_lshr_b32 s4, s15, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s40, 16 +; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s41, 16 +; SI-NEXT: s_lshr_b32 s4, s17, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_lshr_b32 s4, s42, 16 +; SI-NEXT: s_lshr_b32 s4, s18, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 -; SI-NEXT: s_lshr_b32 s4, s43, 16 +; SI-NEXT: s_lshr_b32 s4, s19, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 -; SI-NEXT: s_lshr_b32 s4, s44, 16 +; SI-NEXT: s_lshr_b32 s4, s20, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 -; SI-NEXT: s_lshr_b32 s4, s45, 16 +; SI-NEXT: s_lshr_b32 s4, s21, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 -; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s4, s22, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: s_lshr_b32 s4, s29, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: s_lshr_b32 s4, s28, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: s_lshr_b32 s4, s27, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: s_lshr_b32 s4, s26, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: s_lshr_b32 s4, s25, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: s_lshr_b32 s4, s24, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: s_lshr_b32 s4, s23, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: s_lshr_b32 s4, s45, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: s_lshr_b32 s4, s44, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: s_lshr_b32 s4, s43, 16 ; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: s_lshr_b32 s4, s42, 16 ; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: s_lshr_b32 s4, s41, 16 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_lshr_b32 s4, s40, 16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 @@ -9157,35 +9296,34 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v18, s13 ; SI-NEXT: v_cvt_f32_f16_e32 v20, s14 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s40 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: s_add_i32 s44, s44, 3 +; SI-NEXT: s_add_i32 s45, s45, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 @@ -9193,12 +9331,13 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; SI-NEXT: s_add_i32 s27, s27, 3 ; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s45, s45, 3 -; SI-NEXT: s_add_i32 s44, s44, 3 -; SI-NEXT: s_add_i32 s43, s43, 3 -; SI-NEXT: s_add_i32 s42, s42, 3 -; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s15, s15, 3 ; SI-NEXT: s_add_i32 s14, s14, 3 ; SI-NEXT: s_add_i32 s13, s13, 3 @@ -9209,26 +9348,26 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: s_lshr_b32 s5, s17, 16 -; SI-NEXT: s_lshr_b32 s46, s18, 16 -; SI-NEXT: s_lshr_b32 s47, s19, 16 -; SI-NEXT: s_lshr_b32 s56, s20, 16 -; SI-NEXT: s_lshr_b32 s57, s21, 16 -; SI-NEXT: s_lshr_b32 s58, s22, 16 -; SI-NEXT: s_lshr_b32 s59, s23, 16 -; SI-NEXT: s_lshr_b32 s60, s24, 16 -; SI-NEXT: s_lshr_b32 s61, s25, 16 -; SI-NEXT: s_lshr_b32 s62, s26, 16 -; SI-NEXT: s_lshr_b32 s63, s27, 16 -; SI-NEXT: s_lshr_b32 s72, s28, 16 -; SI-NEXT: s_lshr_b32 s73, s29, 16 -; SI-NEXT: s_lshr_b32 s74, s45, 16 -; SI-NEXT: s_lshr_b32 s75, s44, 16 -; SI-NEXT: s_lshr_b32 s76, s43, 16 -; SI-NEXT: s_lshr_b32 s77, s42, 16 -; SI-NEXT: s_lshr_b32 s78, s41, 16 -; SI-NEXT: s_lshr_b32 s79, s40, 16 +; SI-NEXT: s_lshr_b32 s4, s40, 16 +; SI-NEXT: s_lshr_b32 s5, s41, 16 +; SI-NEXT: s_lshr_b32 s46, s42, 16 +; SI-NEXT: s_lshr_b32 s47, s43, 16 +; SI-NEXT: s_lshr_b32 s56, s44, 16 +; SI-NEXT: s_lshr_b32 s57, s45, 16 +; SI-NEXT: s_lshr_b32 s58, s23, 16 +; SI-NEXT: s_lshr_b32 s59, s24, 16 +; SI-NEXT: s_lshr_b32 s60, s25, 16 +; SI-NEXT: s_lshr_b32 s61, s26, 16 +; SI-NEXT: s_lshr_b32 s62, s27, 16 +; SI-NEXT: s_lshr_b32 s63, s28, 16 +; SI-NEXT: s_lshr_b32 s72, s29, 16 +; SI-NEXT: s_lshr_b32 s73, s22, 16 +; SI-NEXT: s_lshr_b32 s74, s21, 16 +; SI-NEXT: s_lshr_b32 s75, s20, 16 +; SI-NEXT: s_lshr_b32 s76, s19, 16 +; SI-NEXT: s_lshr_b32 s77, s18, 16 +; SI-NEXT: s_lshr_b32 s78, s17, 16 +; SI-NEXT: s_lshr_b32 s79, s16, 16 ; SI-NEXT: s_lshr_b32 s88, s15, 16 ; SI-NEXT: s_lshr_b32 s89, s14, 16 ; SI-NEXT: s_lshr_b32 s90, s13, 16 @@ -9249,29 +9388,29 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v18, s13 ; SI-NEXT: v_cvt_f32_f16_e32 v20, s14 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s43 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v47, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s42 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v57, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s41 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v59, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s40 ; SI-NEXT: v_cvt_f32_f16_e32 v1, vcc_hi ; SI-NEXT: v_cvt_f32_f16_e32 v2, vcc_lo ; SI-NEXT: v_cvt_f32_f16_e32 v3, s95 @@ -9599,18 +9738,46 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: v_writelane_b32 v30, s30, 0 ; VI-NEXT: v_writelane_b32 v30, s31, 1 +; VI-NEXT: v_mov_b32_e32 v17, s16 +; VI-NEXT: v_mov_b32_e32 v18, s17 ; VI-NEXT: v_writelane_b32 v30, s34, 2 +; VI-NEXT: v_mov_b32_e32 v19, s18 +; VI-NEXT: v_readfirstlane_b32 s56, v17 +; VI-NEXT: v_mov_b32_e32 v17, s19 +; VI-NEXT: v_readfirstlane_b32 s47, v18 +; VI-NEXT: v_mov_b32_e32 v18, s20 ; VI-NEXT: v_writelane_b32 v30, s35, 3 +; VI-NEXT: v_readfirstlane_b32 s46, v19 +; VI-NEXT: v_mov_b32_e32 v19, s21 +; VI-NEXT: v_readfirstlane_b32 s45, v17 +; VI-NEXT: v_mov_b32_e32 v17, s22 +; VI-NEXT: v_readfirstlane_b32 s44, v18 +; VI-NEXT: v_mov_b32_e32 v18, s23 ; VI-NEXT: v_writelane_b32 v30, s36, 4 +; VI-NEXT: v_readfirstlane_b32 s43, v19 +; VI-NEXT: v_mov_b32_e32 v19, s24 +; VI-NEXT: v_readfirstlane_b32 s42, v17 +; VI-NEXT: v_mov_b32_e32 v17, s25 +; VI-NEXT: v_readfirstlane_b32 s41, v18 +; VI-NEXT: v_mov_b32_e32 v18, s26 ; VI-NEXT: v_writelane_b32 v30, s37, 5 +; VI-NEXT: v_readfirstlane_b32 s40, v19 +; VI-NEXT: v_mov_b32_e32 v19, s27 +; VI-NEXT: v_readfirstlane_b32 s26, v17 +; VI-NEXT: v_mov_b32_e32 v17, s28 +; VI-NEXT: v_readfirstlane_b32 s25, v18 +; VI-NEXT: v_mov_b32_e32 v18, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; VI-NEXT: v_writelane_b32 v30, s38, 6 -; VI-NEXT: v_readfirstlane_b32 s45, v0 -; VI-NEXT: v_readfirstlane_b32 s44, v1 -; VI-NEXT: v_readfirstlane_b32 s43, v2 -; VI-NEXT: v_readfirstlane_b32 s42, v3 -; VI-NEXT: v_readfirstlane_b32 s41, v4 -; VI-NEXT: v_readfirstlane_b32 s40, v5 +; VI-NEXT: v_readfirstlane_b32 s24, v19 +; VI-NEXT: v_readfirstlane_b32 s23, v17 +; VI-NEXT: v_readfirstlane_b32 s22, v18 +; VI-NEXT: v_readfirstlane_b32 s21, v0 +; VI-NEXT: v_readfirstlane_b32 s20, v1 +; VI-NEXT: v_readfirstlane_b32 s19, v2 +; VI-NEXT: v_readfirstlane_b32 s18, v3 +; VI-NEXT: v_readfirstlane_b32 s17, v4 +; VI-NEXT: v_readfirstlane_b32 s16, v5 ; VI-NEXT: v_readfirstlane_b32 s15, v6 ; VI-NEXT: v_readfirstlane_b32 s14, v7 ; VI-NEXT: v_readfirstlane_b32 s13, v8 @@ -9625,9 +9792,9 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; VI-NEXT: v_writelane_b32 v30, s39, 7 ; VI-NEXT: s_cbranch_scc0 .LBB17_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s46, s7, 16 -; VI-NEXT: s_lshr_b32 s47, s6, 16 -; VI-NEXT: s_lshr_b32 s56, s8, 16 +; VI-NEXT: s_lshr_b32 s27, s7, 16 +; VI-NEXT: s_lshr_b32 s28, s6, 16 +; VI-NEXT: s_lshr_b32 s29, s8, 16 ; VI-NEXT: s_lshr_b32 s57, s9, 16 ; VI-NEXT: s_lshr_b32 s58, s10, 16 ; VI-NEXT: s_lshr_b32 s59, s11, 16 @@ -9635,26 +9802,26 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; VI-NEXT: s_lshr_b32 s61, s13, 16 ; VI-NEXT: s_lshr_b32 s62, s14, 16 ; VI-NEXT: s_lshr_b32 s63, s15, 16 -; VI-NEXT: s_lshr_b32 s72, s40, 16 -; VI-NEXT: s_lshr_b32 s73, s41, 16 -; VI-NEXT: s_lshr_b32 s74, s42, 16 -; VI-NEXT: s_lshr_b32 s75, s43, 16 -; VI-NEXT: s_lshr_b32 s76, s44, 16 -; VI-NEXT: s_lshr_b32 s77, s45, 16 -; VI-NEXT: s_lshr_b32 s78, s29, 16 -; VI-NEXT: s_lshr_b32 s79, s28, 16 -; VI-NEXT: s_lshr_b32 s88, s27, 16 -; VI-NEXT: s_lshr_b32 s89, s26, 16 -; VI-NEXT: s_lshr_b32 s90, s25, 16 -; VI-NEXT: s_lshr_b32 s91, s24, 16 -; VI-NEXT: s_lshr_b32 s30, s23, 16 -; VI-NEXT: s_lshr_b32 s31, s22, 16 -; VI-NEXT: s_lshr_b32 s34, s21, 16 -; VI-NEXT: s_lshr_b32 s35, s20, 16 -; VI-NEXT: s_lshr_b32 s36, s19, 16 -; VI-NEXT: s_lshr_b32 s37, s18, 16 -; VI-NEXT: s_lshr_b32 s38, s17, 16 -; VI-NEXT: s_lshr_b32 s39, s16, 16 +; VI-NEXT: s_lshr_b32 s72, s16, 16 +; VI-NEXT: s_lshr_b32 s73, s17, 16 +; VI-NEXT: s_lshr_b32 s74, s18, 16 +; VI-NEXT: s_lshr_b32 s75, s19, 16 +; VI-NEXT: s_lshr_b32 s76, s20, 16 +; VI-NEXT: s_lshr_b32 s77, s21, 16 +; VI-NEXT: s_lshr_b32 s78, s22, 16 +; VI-NEXT: s_lshr_b32 s79, s23, 16 +; VI-NEXT: s_lshr_b32 s88, s24, 16 +; VI-NEXT: s_lshr_b32 s89, s25, 16 +; VI-NEXT: s_lshr_b32 s90, s26, 16 +; VI-NEXT: s_lshr_b32 s91, s40, 16 +; VI-NEXT: s_lshr_b32 s30, s41, 16 +; VI-NEXT: s_lshr_b32 s31, s42, 16 +; VI-NEXT: s_lshr_b32 s34, s43, 16 +; VI-NEXT: s_lshr_b32 s35, s44, 16 +; VI-NEXT: s_lshr_b32 s36, s45, 16 +; VI-NEXT: s_lshr_b32 s37, s46, 16 +; VI-NEXT: s_lshr_b32 s38, s47, 16 +; VI-NEXT: s_lshr_b32 s39, s56, 16 ; VI-NEXT: s_cbranch_execnz .LBB17_3 ; VI-NEXT: .LBB17_2: ; %cmp.true ; VI-NEXT: s_add_i32 s7, s7, 3 @@ -9667,29 +9834,29 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; VI-NEXT: s_add_i32 s13, s13, 3 ; VI-NEXT: s_add_i32 s14, s14, 3 ; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 ; VI-NEXT: s_add_i32 s40, s40, 3 ; VI-NEXT: s_add_i32 s41, s41, 3 ; VI-NEXT: s_add_i32 s42, s42, 3 ; VI-NEXT: s_add_i32 s43, s43, 3 ; VI-NEXT: s_add_i32 s44, s44, 3 ; VI-NEXT: s_add_i32 s45, s45, 3 -; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_lshr_b32 s46, s7, 16 -; VI-NEXT: s_lshr_b32 s47, s6, 16 -; VI-NEXT: s_lshr_b32 s56, s8, 16 +; VI-NEXT: s_add_i32 s46, s46, 3 +; VI-NEXT: s_add_i32 s47, s47, 3 +; VI-NEXT: s_add_i32 s56, s56, 3 +; VI-NEXT: s_lshr_b32 s27, s7, 16 +; VI-NEXT: s_lshr_b32 s28, s6, 16 +; VI-NEXT: s_lshr_b32 s29, s8, 16 ; VI-NEXT: s_lshr_b32 s57, s9, 16 ; VI-NEXT: s_lshr_b32 s58, s10, 16 ; VI-NEXT: s_lshr_b32 s59, s11, 16 @@ -9697,137 +9864,137 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; VI-NEXT: s_lshr_b32 s61, s13, 16 ; VI-NEXT: s_lshr_b32 s62, s14, 16 ; VI-NEXT: s_lshr_b32 s63, s15, 16 -; VI-NEXT: s_lshr_b32 s72, s40, 16 -; VI-NEXT: s_lshr_b32 s73, s41, 16 -; VI-NEXT: s_lshr_b32 s74, s42, 16 -; VI-NEXT: s_lshr_b32 s75, s43, 16 -; VI-NEXT: s_lshr_b32 s76, s44, 16 -; VI-NEXT: s_lshr_b32 s77, s45, 16 -; VI-NEXT: s_lshr_b32 s78, s29, 16 -; VI-NEXT: s_lshr_b32 s79, s28, 16 -; VI-NEXT: s_lshr_b32 s88, s27, 16 -; VI-NEXT: s_lshr_b32 s89, s26, 16 -; VI-NEXT: s_lshr_b32 s90, s25, 16 -; VI-NEXT: s_lshr_b32 s91, s24, 16 -; VI-NEXT: s_lshr_b32 s30, s23, 16 -; VI-NEXT: s_lshr_b32 s31, s22, 16 -; VI-NEXT: s_lshr_b32 s34, s21, 16 -; VI-NEXT: s_lshr_b32 s35, s20, 16 -; VI-NEXT: s_lshr_b32 s36, s19, 16 -; VI-NEXT: s_lshr_b32 s37, s18, 16 -; VI-NEXT: s_lshr_b32 s38, s17, 16 -; VI-NEXT: s_lshr_b32 s39, s16, 16 +; VI-NEXT: s_lshr_b32 s72, s16, 16 +; VI-NEXT: s_lshr_b32 s73, s17, 16 +; VI-NEXT: s_lshr_b32 s74, s18, 16 +; VI-NEXT: s_lshr_b32 s75, s19, 16 +; VI-NEXT: s_lshr_b32 s76, s20, 16 +; VI-NEXT: s_lshr_b32 s77, s21, 16 +; VI-NEXT: s_lshr_b32 s78, s22, 16 +; VI-NEXT: s_lshr_b32 s79, s23, 16 +; VI-NEXT: s_lshr_b32 s88, s24, 16 +; VI-NEXT: s_lshr_b32 s89, s25, 16 +; VI-NEXT: s_lshr_b32 s90, s26, 16 +; VI-NEXT: s_lshr_b32 s91, s40, 16 +; VI-NEXT: s_lshr_b32 s30, s41, 16 +; VI-NEXT: s_lshr_b32 s31, s42, 16 +; VI-NEXT: s_lshr_b32 s34, s43, 16 +; VI-NEXT: s_lshr_b32 s35, s44, 16 +; VI-NEXT: s_lshr_b32 s36, s45, 16 +; VI-NEXT: s_lshr_b32 s37, s46, 16 +; VI-NEXT: s_lshr_b32 s38, s47, 16 +; VI-NEXT: s_lshr_b32 s39, s56, 16 ; VI-NEXT: .LBB17_3: ; %end -; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_and_b32 s4, 0xffff, s56 ; VI-NEXT: s_lshl_b32 s5, s39, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s16, s38, 16 -; VI-NEXT: s_or_b32 s5, s5, s16 -; VI-NEXT: s_and_b32 s16, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s17, s37, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_and_b32 s17, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s18, s36, 16 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s18, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s19, s35, 16 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_and_b32 s19, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s20, s34, 16 -; VI-NEXT: s_or_b32 s19, s19, s20 -; VI-NEXT: s_and_b32 s20, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s21, s31, 16 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_and_b32 s21, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s22, s30, 16 -; VI-NEXT: s_or_b32 s21, s21, s22 -; VI-NEXT: s_and_b32 s22, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s23, s91, 16 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_and_b32 s23, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s24, s90, 16 -; VI-NEXT: s_or_b32 s23, s23, s24 -; VI-NEXT: s_and_b32 s24, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s25, s89, 16 -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_and_b32 s25, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s26, s88, 16 -; VI-NEXT: s_or_b32 s25, s25, s26 -; VI-NEXT: s_and_b32 s26, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s27, s79, 16 -; VI-NEXT: s_or_b32 s26, s26, s27 -; VI-NEXT: s_and_b32 s27, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s28, s78, 16 -; VI-NEXT: s_or_b32 s27, s27, s28 -; VI-NEXT: s_and_b32 s28, 0xffff, s45 -; VI-NEXT: s_lshl_b32 s29, s77, 16 -; VI-NEXT: s_or_b32 s28, s28, s29 -; VI-NEXT: s_and_b32 s29, 0xffff, s44 -; VI-NEXT: s_lshl_b32 s44, s76, 16 -; VI-NEXT: s_or_b32 s29, s29, s44 +; VI-NEXT: s_and_b32 s5, 0xffff, s47 +; VI-NEXT: s_lshl_b32 s47, s38, 16 +; VI-NEXT: s_or_b32 s5, s5, s47 +; VI-NEXT: s_and_b32 s46, 0xffff, s46 +; VI-NEXT: s_lshl_b32 s47, s37, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s45, 0xffff, s45 +; VI-NEXT: s_lshl_b32 s47, s36, 16 +; VI-NEXT: s_or_b32 s45, s45, s47 +; VI-NEXT: s_and_b32 s44, 0xffff, s44 +; VI-NEXT: s_lshl_b32 s47, s35, 16 +; VI-NEXT: s_or_b32 s44, s44, s47 ; VI-NEXT: s_and_b32 s43, 0xffff, s43 -; VI-NEXT: s_lshl_b32 s44, s75, 16 -; VI-NEXT: s_or_b32 s43, s43, s44 +; VI-NEXT: s_lshl_b32 s47, s34, 16 +; VI-NEXT: s_or_b32 s43, s43, s47 ; VI-NEXT: s_and_b32 s42, 0xffff, s42 -; VI-NEXT: s_lshl_b32 s44, s74, 16 -; VI-NEXT: s_or_b32 s42, s42, s44 +; VI-NEXT: s_lshl_b32 s47, s31, 16 +; VI-NEXT: s_or_b32 s42, s42, s47 ; VI-NEXT: s_and_b32 s41, 0xffff, s41 -; VI-NEXT: s_lshl_b32 s44, s73, 16 -; VI-NEXT: s_or_b32 s41, s41, s44 +; VI-NEXT: s_lshl_b32 s47, s30, 16 +; VI-NEXT: s_or_b32 s41, s41, s47 ; VI-NEXT: s_and_b32 s40, 0xffff, s40 -; VI-NEXT: s_lshl_b32 s44, s72, 16 -; VI-NEXT: s_or_b32 s40, s40, s44 +; VI-NEXT: s_lshl_b32 s47, s91, 16 +; VI-NEXT: s_or_b32 s40, s40, s47 +; VI-NEXT: s_and_b32 s26, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s47, s90, 16 +; VI-NEXT: s_or_b32 s26, s26, s47 +; VI-NEXT: s_and_b32 s25, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s47, s89, 16 +; VI-NEXT: s_or_b32 s25, s25, s47 +; VI-NEXT: s_and_b32 s24, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s47, s88, 16 +; VI-NEXT: s_or_b32 s24, s24, s47 +; VI-NEXT: s_and_b32 s23, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s47, s79, 16 +; VI-NEXT: s_or_b32 s23, s23, s47 +; VI-NEXT: s_and_b32 s22, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s47, s78, 16 +; VI-NEXT: s_or_b32 s22, s22, s47 +; VI-NEXT: s_and_b32 s21, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s47, s77, 16 +; VI-NEXT: s_or_b32 s21, s21, s47 +; VI-NEXT: s_and_b32 s20, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s76, 16 +; VI-NEXT: s_or_b32 s20, s20, s47 +; VI-NEXT: s_and_b32 s19, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s47, s75, 16 +; VI-NEXT: s_or_b32 s19, s19, s47 +; VI-NEXT: s_and_b32 s18, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s47, s74, 16 +; VI-NEXT: s_or_b32 s18, s18, s47 +; VI-NEXT: s_and_b32 s17, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s47, s73, 16 +; VI-NEXT: s_or_b32 s17, s17, s47 +; VI-NEXT: s_and_b32 s16, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s47, s72, 16 +; VI-NEXT: s_or_b32 s16, s16, s47 ; VI-NEXT: s_and_b32 s15, 0xffff, s15 -; VI-NEXT: s_lshl_b32 s44, s63, 16 -; VI-NEXT: s_or_b32 s15, s15, s44 +; VI-NEXT: s_lshl_b32 s47, s63, 16 +; VI-NEXT: s_or_b32 s15, s15, s47 ; VI-NEXT: s_and_b32 s14, 0xffff, s14 -; VI-NEXT: s_lshl_b32 s44, s62, 16 -; VI-NEXT: s_or_b32 s14, s14, s44 +; VI-NEXT: s_lshl_b32 s47, s62, 16 +; VI-NEXT: s_or_b32 s14, s14, s47 ; VI-NEXT: s_and_b32 s13, 0xffff, s13 -; VI-NEXT: s_lshl_b32 s44, s61, 16 -; VI-NEXT: s_or_b32 s13, s13, s44 +; VI-NEXT: s_lshl_b32 s47, s61, 16 +; VI-NEXT: s_or_b32 s13, s13, s47 ; VI-NEXT: s_and_b32 s12, 0xffff, s12 -; VI-NEXT: s_lshl_b32 s44, s60, 16 -; VI-NEXT: s_or_b32 s12, s12, s44 +; VI-NEXT: s_lshl_b32 s47, s60, 16 +; VI-NEXT: s_or_b32 s12, s12, s47 ; VI-NEXT: s_and_b32 s11, 0xffff, s11 -; VI-NEXT: s_lshl_b32 s44, s59, 16 -; VI-NEXT: s_or_b32 s11, s11, s44 +; VI-NEXT: s_lshl_b32 s47, s59, 16 +; VI-NEXT: s_or_b32 s11, s11, s47 ; VI-NEXT: s_and_b32 s10, 0xffff, s10 -; VI-NEXT: s_lshl_b32 s44, s58, 16 -; VI-NEXT: s_or_b32 s10, s10, s44 +; VI-NEXT: s_lshl_b32 s47, s58, 16 +; VI-NEXT: s_or_b32 s10, s10, s47 ; VI-NEXT: s_and_b32 s9, 0xffff, s9 -; VI-NEXT: s_lshl_b32 s44, s57, 16 -; VI-NEXT: s_or_b32 s9, s9, s44 +; VI-NEXT: s_lshl_b32 s47, s57, 16 ; VI-NEXT: s_and_b32 s8, 0xffff, s8 -; VI-NEXT: s_lshl_b32 s44, s56, 16 -; VI-NEXT: s_or_b32 s8, s8, s44 +; VI-NEXT: s_lshl_b32 s29, s29, 16 ; VI-NEXT: s_and_b32 s6, 0xffff, s6 -; VI-NEXT: s_lshl_b32 s44, s47, 16 -; VI-NEXT: s_or_b32 s6, s6, s44 +; VI-NEXT: s_lshl_b32 s28, s28, 16 ; VI-NEXT: s_and_b32 s7, 0xffff, s7 -; VI-NEXT: s_lshl_b32 s44, s46, 16 -; VI-NEXT: s_or_b32 s7, s7, s44 +; VI-NEXT: s_lshl_b32 s27, s27, 16 +; VI-NEXT: s_or_b32 s9, s9, s47 +; VI-NEXT: s_or_b32 s8, s8, s29 +; VI-NEXT: s_or_b32 s6, s6, s28 +; VI-NEXT: s_or_b32 s7, s7, s27 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: v_mov_b32_e32 v3, s17 -; VI-NEXT: v_mov_b32_e32 v4, s18 -; VI-NEXT: v_mov_b32_e32 v5, s19 -; VI-NEXT: v_mov_b32_e32 v6, s20 -; VI-NEXT: v_mov_b32_e32 v7, s21 -; VI-NEXT: v_mov_b32_e32 v8, s22 -; VI-NEXT: v_mov_b32_e32 v9, s23 -; VI-NEXT: v_mov_b32_e32 v10, s24 -; VI-NEXT: v_mov_b32_e32 v11, s25 -; VI-NEXT: v_mov_b32_e32 v12, s26 -; VI-NEXT: v_mov_b32_e32 v13, s27 -; VI-NEXT: v_mov_b32_e32 v14, s28 -; VI-NEXT: v_mov_b32_e32 v15, s29 -; VI-NEXT: v_mov_b32_e32 v16, s43 -; VI-NEXT: v_mov_b32_e32 v17, s42 -; VI-NEXT: v_mov_b32_e32 v18, s41 -; VI-NEXT: v_mov_b32_e32 v19, s40 +; VI-NEXT: v_mov_b32_e32 v2, s46 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s44 +; VI-NEXT: v_mov_b32_e32 v5, s43 +; VI-NEXT: v_mov_b32_e32 v6, s42 +; VI-NEXT: v_mov_b32_e32 v7, s41 +; VI-NEXT: v_mov_b32_e32 v8, s40 +; VI-NEXT: v_mov_b32_e32 v9, s26 +; VI-NEXT: v_mov_b32_e32 v10, s25 +; VI-NEXT: v_mov_b32_e32 v11, s24 +; VI-NEXT: v_mov_b32_e32 v12, s23 +; VI-NEXT: v_mov_b32_e32 v13, s22 +; VI-NEXT: v_mov_b32_e32 v14, s21 +; VI-NEXT: v_mov_b32_e32 v15, s20 +; VI-NEXT: v_mov_b32_e32 v16, s19 +; VI-NEXT: v_mov_b32_e32 v17, s18 +; VI-NEXT: v_mov_b32_e32 v18, s17 +; VI-NEXT: v_mov_b32_e32 v19, s16 ; VI-NEXT: v_mov_b32_e32 v20, s15 ; VI-NEXT: v_mov_b32_e32 v21, s14 ; VI-NEXT: v_mov_b32_e32 v22, s13 @@ -9879,9 +10046,9 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; VI-NEXT: ; implicit-def: $sgpr59 ; VI-NEXT: ; implicit-def: $sgpr58 ; VI-NEXT: ; implicit-def: $sgpr57 -; VI-NEXT: ; implicit-def: $sgpr56 -; VI-NEXT: ; implicit-def: $sgpr47 -; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 ; VI-NEXT: s_branch .LBB17_2 ; ; GFX9-LABEL: bitcast_v30i32_to_v60f16_scalar: @@ -9890,20 +10057,48 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v17, s16 +; GFX9-NEXT: v_mov_b32_e32 v18, s17 +; GFX9-NEXT: v_mov_b32_e32 v19, s18 +; GFX9-NEXT: v_readfirstlane_b32 s6, v17 +; GFX9-NEXT: v_mov_b32_e32 v17, s19 +; GFX9-NEXT: v_readfirstlane_b32 s7, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, s20 +; GFX9-NEXT: v_readfirstlane_b32 s8, v19 +; GFX9-NEXT: v_mov_b32_e32 v19, s21 +; GFX9-NEXT: v_readfirstlane_b32 s9, v17 +; GFX9-NEXT: v_mov_b32_e32 v17, s22 +; GFX9-NEXT: v_readfirstlane_b32 s10, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, s23 ; GFX9-NEXT: v_writelane_b32 v30, s30, 0 +; GFX9-NEXT: v_readfirstlane_b32 s11, v19 +; GFX9-NEXT: v_mov_b32_e32 v19, s24 +; GFX9-NEXT: v_readfirstlane_b32 s12, v17 +; GFX9-NEXT: v_mov_b32_e32 v17, s25 +; GFX9-NEXT: v_readfirstlane_b32 s13, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, s26 ; GFX9-NEXT: v_writelane_b32 v30, s31, 1 +; GFX9-NEXT: v_readfirstlane_b32 s14, v19 +; GFX9-NEXT: v_mov_b32_e32 v19, s27 +; GFX9-NEXT: v_readfirstlane_b32 s15, v17 +; GFX9-NEXT: v_mov_b32_e32 v17, s28 +; GFX9-NEXT: v_readfirstlane_b32 s16, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; GFX9-NEXT: v_writelane_b32 v30, s34, 2 -; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: v_readfirstlane_b32 s7, v1 -; GFX9-NEXT: v_readfirstlane_b32 s8, v2 -; GFX9-NEXT: v_readfirstlane_b32 s9, v3 -; GFX9-NEXT: v_readfirstlane_b32 s10, v4 -; GFX9-NEXT: v_readfirstlane_b32 s11, v5 -; GFX9-NEXT: v_readfirstlane_b32 s12, v6 -; GFX9-NEXT: v_readfirstlane_b32 s13, v7 -; GFX9-NEXT: v_readfirstlane_b32 s14, v8 -; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s17, v19 +; GFX9-NEXT: v_readfirstlane_b32 s18, v17 +; GFX9-NEXT: v_readfirstlane_b32 s19, v18 +; GFX9-NEXT: v_readfirstlane_b32 s20, v0 +; GFX9-NEXT: v_readfirstlane_b32 s21, v1 +; GFX9-NEXT: v_readfirstlane_b32 s22, v2 +; GFX9-NEXT: v_readfirstlane_b32 s23, v3 +; GFX9-NEXT: v_readfirstlane_b32 s24, v4 +; GFX9-NEXT: v_readfirstlane_b32 s25, v5 +; GFX9-NEXT: v_readfirstlane_b32 s26, v6 +; GFX9-NEXT: v_readfirstlane_b32 s27, v7 +; GFX9-NEXT: v_readfirstlane_b32 s28, v8 +; GFX9-NEXT: v_readfirstlane_b32 s29, v9 ; GFX9-NEXT: v_readfirstlane_b32 s40, v10 ; GFX9-NEXT: v_readfirstlane_b32 s41, v11 ; GFX9-NEXT: v_readfirstlane_b32 s42, v12 @@ -9920,30 +10115,30 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; GFX9-NEXT: s_lshr_b32 s57, s42, 16 ; GFX9-NEXT: s_lshr_b32 s58, s41, 16 ; GFX9-NEXT: s_lshr_b32 s59, s40, 16 -; GFX9-NEXT: s_lshr_b32 s60, s15, 16 -; GFX9-NEXT: s_lshr_b32 s61, s14, 16 -; GFX9-NEXT: s_lshr_b32 s62, s13, 16 -; GFX9-NEXT: s_lshr_b32 s63, s12, 16 -; GFX9-NEXT: s_lshr_b32 s72, s11, 16 -; GFX9-NEXT: s_lshr_b32 s73, s10, 16 -; GFX9-NEXT: s_lshr_b32 s74, s9, 16 -; GFX9-NEXT: s_lshr_b32 s75, s8, 16 -; GFX9-NEXT: s_lshr_b32 s76, s7, 16 -; GFX9-NEXT: s_lshr_b32 s77, s6, 16 -; GFX9-NEXT: s_lshr_b32 s78, s29, 16 -; GFX9-NEXT: s_lshr_b32 s79, s28, 16 -; GFX9-NEXT: s_lshr_b32 s88, s27, 16 -; GFX9-NEXT: s_lshr_b32 s89, s26, 16 -; GFX9-NEXT: s_lshr_b32 s90, s25, 16 -; GFX9-NEXT: s_lshr_b32 s91, s24, 16 -; GFX9-NEXT: s_lshr_b32 s92, s23, 16 -; GFX9-NEXT: s_lshr_b32 s93, s22, 16 -; GFX9-NEXT: s_lshr_b32 s94, s21, 16 -; GFX9-NEXT: s_lshr_b32 s95, s20, 16 -; GFX9-NEXT: s_lshr_b32 s30, s19, 16 -; GFX9-NEXT: s_lshr_b32 s31, s18, 16 -; GFX9-NEXT: s_lshr_b32 s34, s17, 16 -; GFX9-NEXT: s_lshr_b32 s35, s16, 16 +; GFX9-NEXT: s_lshr_b32 s60, s29, 16 +; GFX9-NEXT: s_lshr_b32 s61, s28, 16 +; GFX9-NEXT: s_lshr_b32 s62, s27, 16 +; GFX9-NEXT: s_lshr_b32 s63, s26, 16 +; GFX9-NEXT: s_lshr_b32 s72, s25, 16 +; GFX9-NEXT: s_lshr_b32 s73, s24, 16 +; GFX9-NEXT: s_lshr_b32 s74, s23, 16 +; GFX9-NEXT: s_lshr_b32 s75, s22, 16 +; GFX9-NEXT: s_lshr_b32 s76, s21, 16 +; GFX9-NEXT: s_lshr_b32 s77, s20, 16 +; GFX9-NEXT: s_lshr_b32 s78, s19, 16 +; GFX9-NEXT: s_lshr_b32 s79, s18, 16 +; GFX9-NEXT: s_lshr_b32 s88, s17, 16 +; GFX9-NEXT: s_lshr_b32 s89, s16, 16 +; GFX9-NEXT: s_lshr_b32 s90, s15, 16 +; GFX9-NEXT: s_lshr_b32 s91, s14, 16 +; GFX9-NEXT: s_lshr_b32 s92, s13, 16 +; GFX9-NEXT: s_lshr_b32 s93, s12, 16 +; GFX9-NEXT: s_lshr_b32 s94, s11, 16 +; GFX9-NEXT: s_lshr_b32 s95, s10, 16 +; GFX9-NEXT: s_lshr_b32 s30, s9, 16 +; GFX9-NEXT: s_lshr_b32 s31, s8, 16 +; GFX9-NEXT: s_lshr_b32 s34, s7, 16 +; GFX9-NEXT: s_lshr_b32 s35, s6, 16 ; GFX9-NEXT: s_cbranch_execnz .LBB17_3 ; GFX9-NEXT: .LBB17_2: ; %cmp.true ; GFX9-NEXT: s_add_i32 s45, s45, 3 @@ -9952,16 +10147,6 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; GFX9-NEXT: s_add_i32 s42, s42, 3 ; GFX9-NEXT: s_add_i32 s41, s41, 3 ; GFX9-NEXT: s_add_i32 s40, s40, 3 -; GFX9-NEXT: s_add_i32 s15, s15, 3 -; GFX9-NEXT: s_add_i32 s14, s14, 3 -; GFX9-NEXT: s_add_i32 s13, s13, 3 -; GFX9-NEXT: s_add_i32 s12, s12, 3 -; GFX9-NEXT: s_add_i32 s11, s11, 3 -; GFX9-NEXT: s_add_i32 s10, s10, 3 -; GFX9-NEXT: s_add_i32 s9, s9, 3 -; GFX9-NEXT: s_add_i32 s8, s8, 3 -; GFX9-NEXT: s_add_i32 s7, s7, 3 -; GFX9-NEXT: s_add_i32 s6, s6, 3 ; GFX9-NEXT: s_add_i32 s29, s29, 3 ; GFX9-NEXT: s_add_i32 s28, s28, 3 ; GFX9-NEXT: s_add_i32 s27, s27, 3 @@ -9976,61 +10161,71 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; GFX9-NEXT: s_add_i32 s18, s18, 3 ; GFX9-NEXT: s_add_i32 s17, s17, 3 ; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s15, s15, 3 +; GFX9-NEXT: s_add_i32 s14, s14, 3 +; GFX9-NEXT: s_add_i32 s13, s13, 3 +; GFX9-NEXT: s_add_i32 s12, s12, 3 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_add_i32 s10, s10, 3 +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: s_add_i32 s6, s6, 3 ; GFX9-NEXT: s_lshr_b32 s46, s45, 16 ; GFX9-NEXT: s_lshr_b32 s47, s44, 16 ; GFX9-NEXT: s_lshr_b32 s56, s43, 16 ; GFX9-NEXT: s_lshr_b32 s57, s42, 16 ; GFX9-NEXT: s_lshr_b32 s58, s41, 16 ; GFX9-NEXT: s_lshr_b32 s59, s40, 16 -; GFX9-NEXT: s_lshr_b32 s60, s15, 16 -; GFX9-NEXT: s_lshr_b32 s61, s14, 16 -; GFX9-NEXT: s_lshr_b32 s62, s13, 16 -; GFX9-NEXT: s_lshr_b32 s63, s12, 16 -; GFX9-NEXT: s_lshr_b32 s72, s11, 16 -; GFX9-NEXT: s_lshr_b32 s73, s10, 16 -; GFX9-NEXT: s_lshr_b32 s74, s9, 16 -; GFX9-NEXT: s_lshr_b32 s75, s8, 16 -; GFX9-NEXT: s_lshr_b32 s76, s7, 16 -; GFX9-NEXT: s_lshr_b32 s77, s6, 16 -; GFX9-NEXT: s_lshr_b32 s78, s29, 16 -; GFX9-NEXT: s_lshr_b32 s79, s28, 16 -; GFX9-NEXT: s_lshr_b32 s88, s27, 16 -; GFX9-NEXT: s_lshr_b32 s89, s26, 16 -; GFX9-NEXT: s_lshr_b32 s90, s25, 16 -; GFX9-NEXT: s_lshr_b32 s91, s24, 16 -; GFX9-NEXT: s_lshr_b32 s92, s23, 16 -; GFX9-NEXT: s_lshr_b32 s93, s22, 16 -; GFX9-NEXT: s_lshr_b32 s94, s21, 16 -; GFX9-NEXT: s_lshr_b32 s95, s20, 16 -; GFX9-NEXT: s_lshr_b32 s30, s19, 16 -; GFX9-NEXT: s_lshr_b32 s31, s18, 16 -; GFX9-NEXT: s_lshr_b32 s34, s17, 16 -; GFX9-NEXT: s_lshr_b32 s35, s16, 16 +; GFX9-NEXT: s_lshr_b32 s60, s29, 16 +; GFX9-NEXT: s_lshr_b32 s61, s28, 16 +; GFX9-NEXT: s_lshr_b32 s62, s27, 16 +; GFX9-NEXT: s_lshr_b32 s63, s26, 16 +; GFX9-NEXT: s_lshr_b32 s72, s25, 16 +; GFX9-NEXT: s_lshr_b32 s73, s24, 16 +; GFX9-NEXT: s_lshr_b32 s74, s23, 16 +; GFX9-NEXT: s_lshr_b32 s75, s22, 16 +; GFX9-NEXT: s_lshr_b32 s76, s21, 16 +; GFX9-NEXT: s_lshr_b32 s77, s20, 16 +; GFX9-NEXT: s_lshr_b32 s78, s19, 16 +; GFX9-NEXT: s_lshr_b32 s79, s18, 16 +; GFX9-NEXT: s_lshr_b32 s88, s17, 16 +; GFX9-NEXT: s_lshr_b32 s89, s16, 16 +; GFX9-NEXT: s_lshr_b32 s90, s15, 16 +; GFX9-NEXT: s_lshr_b32 s91, s14, 16 +; GFX9-NEXT: s_lshr_b32 s92, s13, 16 +; GFX9-NEXT: s_lshr_b32 s93, s12, 16 +; GFX9-NEXT: s_lshr_b32 s94, s11, 16 +; GFX9-NEXT: s_lshr_b32 s95, s10, 16 +; GFX9-NEXT: s_lshr_b32 s30, s9, 16 +; GFX9-NEXT: s_lshr_b32 s31, s8, 16 +; GFX9-NEXT: s_lshr_b32 s34, s7, 16 +; GFX9-NEXT: s_lshr_b32 s35, s6, 16 ; GFX9-NEXT: .LBB17_3: ; %end -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s35 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s34 -; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s31 -; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s30 -; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s95 -; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s94 -; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s93 -; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s92 -; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s91 -; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s90 -; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s89 -; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s88 -; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s79 -; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s78 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s77 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s76 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s75 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s74 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s73 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s72 -; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s63 -; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s62 -; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s61 -; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s35 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s7, s34 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s31 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s9, s30 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s10, s95 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s11, s94 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s12, s93 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s13, s92 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s14, s91 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s15, s90 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s16, s89 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s17, s88 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s60 ; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s59 ; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s58 ; GFX9-NEXT: s_pack_ll_b32_b16 s40, s42, s57 @@ -10039,28 +10234,28 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; GFX9-NEXT: s_pack_ll_b32_b16 s43, s45, s46 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s18 -; GFX9-NEXT: v_mov_b32_e32 v5, s19 -; GFX9-NEXT: v_mov_b32_e32 v6, s20 -; GFX9-NEXT: v_mov_b32_e32 v7, s21 -; GFX9-NEXT: v_mov_b32_e32 v8, s22 -; GFX9-NEXT: v_mov_b32_e32 v9, s23 -; GFX9-NEXT: v_mov_b32_e32 v10, s24 -; GFX9-NEXT: v_mov_b32_e32 v11, s25 -; GFX9-NEXT: v_mov_b32_e32 v12, s26 -; GFX9-NEXT: v_mov_b32_e32 v13, s27 -; GFX9-NEXT: v_mov_b32_e32 v14, s6 -; GFX9-NEXT: v_mov_b32_e32 v15, s7 -; GFX9-NEXT: v_mov_b32_e32 v16, s8 -; GFX9-NEXT: v_mov_b32_e32 v17, s9 -; GFX9-NEXT: v_mov_b32_e32 v18, s10 -; GFX9-NEXT: v_mov_b32_e32 v19, s11 -; GFX9-NEXT: v_mov_b32_e32 v20, s12 -; GFX9-NEXT: v_mov_b32_e32 v21, s13 -; GFX9-NEXT: v_mov_b32_e32 v22, s14 -; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-NEXT: v_mov_b32_e32 v12, s16 +; GFX9-NEXT: v_mov_b32_e32 v13, s17 +; GFX9-NEXT: v_mov_b32_e32 v14, s18 +; GFX9-NEXT: v_mov_b32_e32 v15, s19 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v18, s22 +; GFX9-NEXT: v_mov_b32_e32 v19, s23 +; GFX9-NEXT: v_mov_b32_e32 v20, s24 +; GFX9-NEXT: v_mov_b32_e32 v21, s25 +; GFX9-NEXT: v_mov_b32_e32 v22, s26 +; GFX9-NEXT: v_mov_b32_e32 v23, s27 ; GFX9-NEXT: v_mov_b32_e32 v24, s28 ; GFX9-NEXT: v_mov_b32_e32 v25, s29 ; GFX9-NEXT: v_mov_b32_e32 v26, s40 @@ -10112,49 +10307,76 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; GFX11-LABEL: bitcast_v30i32_to_v60f16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v13, s0 :: v_dual_mov_b32 v14, s1 +; GFX11-NEXT: v_dual_mov_b32 v15, s2 :: v_dual_mov_b32 v16, s3 +; GFX11-NEXT: v_dual_mov_b32 v17, s16 :: v_dual_mov_b32 v18, s17 +; GFX11-NEXT: v_dual_mov_b32 v19, s18 :: v_dual_mov_b32 v20, s19 +; GFX11-NEXT: v_dual_mov_b32 v21, s20 :: v_dual_mov_b32 v22, s21 +; GFX11-NEXT: v_dual_mov_b32 v23, s22 :: v_dual_mov_b32 v24, s23 +; GFX11-NEXT: v_dual_mov_b32 v25, s24 :: v_dual_mov_b32 v26, s25 +; GFX11-NEXT: v_dual_mov_b32 v27, s26 :: v_dual_mov_b32 v28, s27 +; GFX11-NEXT: v_dual_mov_b32 v29, s28 :: v_dual_mov_b32 v30, s29 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: v_readfirstlane_b32 s8, v4 -; GFX11-NEXT: v_readfirstlane_b32 s9, v5 -; GFX11-NEXT: v_readfirstlane_b32 s10, v6 -; GFX11-NEXT: v_readfirstlane_b32 s11, v7 -; GFX11-NEXT: v_readfirstlane_b32 s12, v8 -; GFX11-NEXT: v_readfirstlane_b32 s13, v9 -; GFX11-NEXT: v_readfirstlane_b32 s15, v10 -; GFX11-NEXT: v_readfirstlane_b32 s14, v11 +; GFX11-NEXT: v_readfirstlane_b32 s0, v13 +; GFX11-NEXT: v_readfirstlane_b32 s1, v14 +; GFX11-NEXT: v_readfirstlane_b32 s2, v15 +; GFX11-NEXT: v_readfirstlane_b32 s3, v16 +; GFX11-NEXT: v_readfirstlane_b32 s4, v17 +; GFX11-NEXT: v_readfirstlane_b32 s5, v18 +; GFX11-NEXT: v_readfirstlane_b32 s6, v19 +; GFX11-NEXT: v_readfirstlane_b32 s7, v20 +; GFX11-NEXT: v_readfirstlane_b32 s8, v21 +; GFX11-NEXT: v_readfirstlane_b32 s9, v22 +; GFX11-NEXT: v_readfirstlane_b32 s10, v23 +; GFX11-NEXT: v_readfirstlane_b32 s11, v24 +; GFX11-NEXT: v_readfirstlane_b32 s12, v25 +; GFX11-NEXT: v_readfirstlane_b32 s13, v26 +; GFX11-NEXT: v_readfirstlane_b32 s14, v27 +; GFX11-NEXT: v_readfirstlane_b32 s15, v28 +; GFX11-NEXT: v_readfirstlane_b32 s16, v29 +; GFX11-NEXT: v_readfirstlane_b32 s17, v30 +; GFX11-NEXT: v_readfirstlane_b32 s18, v0 +; GFX11-NEXT: v_readfirstlane_b32 s19, v1 +; GFX11-NEXT: v_readfirstlane_b32 s20, v2 +; GFX11-NEXT: v_readfirstlane_b32 s21, v3 +; GFX11-NEXT: v_readfirstlane_b32 s22, v4 +; GFX11-NEXT: v_readfirstlane_b32 s23, v5 +; GFX11-NEXT: v_readfirstlane_b32 s24, v6 +; GFX11-NEXT: v_readfirstlane_b32 s25, v7 +; GFX11-NEXT: v_readfirstlane_b32 s26, v8 +; GFX11-NEXT: v_readfirstlane_b32 s27, v9 +; GFX11-NEXT: v_readfirstlane_b32 s29, v10 +; GFX11-NEXT: v_readfirstlane_b32 s28, v11 ; GFX11-NEXT: s_mov_b32 s94, 0 ; GFX11-NEXT: s_and_b32 s40, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB17_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: s_lshr_b32 s40, s14, 16 -; GFX11-NEXT: s_lshr_b32 s41, s15, 16 -; GFX11-NEXT: s_lshr_b32 s42, s13, 16 -; GFX11-NEXT: s_lshr_b32 s43, s12, 16 -; GFX11-NEXT: s_lshr_b32 s44, s11, 16 -; GFX11-NEXT: s_lshr_b32 s45, s10, 16 -; GFX11-NEXT: s_lshr_b32 s46, s9, 16 -; GFX11-NEXT: s_lshr_b32 s47, s8, 16 -; GFX11-NEXT: s_lshr_b32 s56, s7, 16 -; GFX11-NEXT: s_lshr_b32 s57, s6, 16 -; GFX11-NEXT: s_lshr_b32 s58, s5, 16 -; GFX11-NEXT: s_lshr_b32 s59, s4, 16 -; GFX11-NEXT: s_lshr_b32 s60, s29, 16 -; GFX11-NEXT: s_lshr_b32 s61, s28, 16 -; GFX11-NEXT: s_lshr_b32 s62, s27, 16 -; GFX11-NEXT: s_lshr_b32 s63, s26, 16 -; GFX11-NEXT: s_lshr_b32 s72, s25, 16 -; GFX11-NEXT: s_lshr_b32 s73, s24, 16 -; GFX11-NEXT: s_lshr_b32 s74, s23, 16 -; GFX11-NEXT: s_lshr_b32 s75, s22, 16 -; GFX11-NEXT: s_lshr_b32 s76, s21, 16 -; GFX11-NEXT: s_lshr_b32 s77, s20, 16 -; GFX11-NEXT: s_lshr_b32 s78, s19, 16 -; GFX11-NEXT: s_lshr_b32 s79, s18, 16 -; GFX11-NEXT: s_lshr_b32 s88, s17, 16 -; GFX11-NEXT: s_lshr_b32 s89, s16, 16 +; GFX11-NEXT: s_lshr_b32 s40, s28, 16 +; GFX11-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-NEXT: s_lshr_b32 s44, s25, 16 +; GFX11-NEXT: s_lshr_b32 s45, s24, 16 +; GFX11-NEXT: s_lshr_b32 s46, s23, 16 +; GFX11-NEXT: s_lshr_b32 s47, s22, 16 +; GFX11-NEXT: s_lshr_b32 s56, s21, 16 +; GFX11-NEXT: s_lshr_b32 s57, s20, 16 +; GFX11-NEXT: s_lshr_b32 s58, s19, 16 +; GFX11-NEXT: s_lshr_b32 s59, s18, 16 +; GFX11-NEXT: s_lshr_b32 s60, s17, 16 +; GFX11-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-NEXT: s_lshr_b32 s62, s15, 16 +; GFX11-NEXT: s_lshr_b32 s63, s14, 16 +; GFX11-NEXT: s_lshr_b32 s72, s13, 16 +; GFX11-NEXT: s_lshr_b32 s73, s12, 16 +; GFX11-NEXT: s_lshr_b32 s74, s11, 16 +; GFX11-NEXT: s_lshr_b32 s75, s10, 16 +; GFX11-NEXT: s_lshr_b32 s76, s9, 16 +; GFX11-NEXT: s_lshr_b32 s77, s8, 16 +; GFX11-NEXT: s_lshr_b32 s78, s7, 16 +; GFX11-NEXT: s_lshr_b32 s79, s6, 16 +; GFX11-NEXT: s_lshr_b32 s88, s5, 16 +; GFX11-NEXT: s_lshr_b32 s89, s4, 16 ; GFX11-NEXT: s_lshr_b32 s90, s3, 16 ; GFX11-NEXT: s_lshr_b32 s91, s2, 16 ; GFX11-NEXT: s_lshr_b32 s92, s1, 16 @@ -10162,20 +10384,8 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s94 ; GFX11-NEXT: s_cbranch_vccnz .LBB17_3 ; GFX11-NEXT: .LBB17_2: ; %cmp.true -; GFX11-NEXT: s_add_i32 s14, s14, 3 -; GFX11-NEXT: s_add_i32 s15, s15, 3 -; GFX11-NEXT: s_add_i32 s13, s13, 3 -; GFX11-NEXT: s_add_i32 s12, s12, 3 -; GFX11-NEXT: s_add_i32 s11, s11, 3 -; GFX11-NEXT: s_add_i32 s10, s10, 3 -; GFX11-NEXT: s_add_i32 s9, s9, 3 -; GFX11-NEXT: s_add_i32 s8, s8, 3 -; GFX11-NEXT: s_add_i32 s7, s7, 3 -; GFX11-NEXT: s_add_i32 s6, s6, 3 -; GFX11-NEXT: s_add_i32 s5, s5, 3 -; GFX11-NEXT: s_add_i32 s4, s4, 3 -; GFX11-NEXT: s_add_i32 s29, s29, 3 ; GFX11-NEXT: s_add_i32 s28, s28, 3 +; GFX11-NEXT: s_add_i32 s29, s29, 3 ; GFX11-NEXT: s_add_i32 s27, s27, 3 ; GFX11-NEXT: s_add_i32 s26, s26, 3 ; GFX11-NEXT: s_add_i32 s25, s25, 3 @@ -10188,36 +10398,48 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; GFX11-NEXT: s_add_i32 s18, s18, 3 ; GFX11-NEXT: s_add_i32 s17, s17, 3 ; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s15, s15, 3 +; GFX11-NEXT: s_add_i32 s14, s14, 3 +; GFX11-NEXT: s_add_i32 s13, s13, 3 +; GFX11-NEXT: s_add_i32 s12, s12, 3 +; GFX11-NEXT: s_add_i32 s11, s11, 3 +; GFX11-NEXT: s_add_i32 s10, s10, 3 +; GFX11-NEXT: s_add_i32 s9, s9, 3 +; GFX11-NEXT: s_add_i32 s8, s8, 3 +; GFX11-NEXT: s_add_i32 s7, s7, 3 +; GFX11-NEXT: s_add_i32 s6, s6, 3 +; GFX11-NEXT: s_add_i32 s5, s5, 3 +; GFX11-NEXT: s_add_i32 s4, s4, 3 ; GFX11-NEXT: s_add_i32 s3, s3, 3 ; GFX11-NEXT: s_add_i32 s2, s2, 3 ; GFX11-NEXT: s_add_i32 s1, s1, 3 ; GFX11-NEXT: s_add_i32 s0, s0, 3 -; GFX11-NEXT: s_lshr_b32 s40, s14, 16 -; GFX11-NEXT: s_lshr_b32 s41, s15, 16 -; GFX11-NEXT: s_lshr_b32 s42, s13, 16 -; GFX11-NEXT: s_lshr_b32 s43, s12, 16 -; GFX11-NEXT: s_lshr_b32 s44, s11, 16 -; GFX11-NEXT: s_lshr_b32 s45, s10, 16 -; GFX11-NEXT: s_lshr_b32 s46, s9, 16 -; GFX11-NEXT: s_lshr_b32 s47, s8, 16 -; GFX11-NEXT: s_lshr_b32 s56, s7, 16 -; GFX11-NEXT: s_lshr_b32 s57, s6, 16 -; GFX11-NEXT: s_lshr_b32 s58, s5, 16 -; GFX11-NEXT: s_lshr_b32 s59, s4, 16 -; GFX11-NEXT: s_lshr_b32 s60, s29, 16 -; GFX11-NEXT: s_lshr_b32 s61, s28, 16 -; GFX11-NEXT: s_lshr_b32 s62, s27, 16 -; GFX11-NEXT: s_lshr_b32 s63, s26, 16 -; GFX11-NEXT: s_lshr_b32 s72, s25, 16 -; GFX11-NEXT: s_lshr_b32 s73, s24, 16 -; GFX11-NEXT: s_lshr_b32 s74, s23, 16 -; GFX11-NEXT: s_lshr_b32 s75, s22, 16 -; GFX11-NEXT: s_lshr_b32 s76, s21, 16 -; GFX11-NEXT: s_lshr_b32 s77, s20, 16 -; GFX11-NEXT: s_lshr_b32 s78, s19, 16 -; GFX11-NEXT: s_lshr_b32 s79, s18, 16 -; GFX11-NEXT: s_lshr_b32 s88, s17, 16 -; GFX11-NEXT: s_lshr_b32 s89, s16, 16 +; GFX11-NEXT: s_lshr_b32 s40, s28, 16 +; GFX11-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-NEXT: s_lshr_b32 s44, s25, 16 +; GFX11-NEXT: s_lshr_b32 s45, s24, 16 +; GFX11-NEXT: s_lshr_b32 s46, s23, 16 +; GFX11-NEXT: s_lshr_b32 s47, s22, 16 +; GFX11-NEXT: s_lshr_b32 s56, s21, 16 +; GFX11-NEXT: s_lshr_b32 s57, s20, 16 +; GFX11-NEXT: s_lshr_b32 s58, s19, 16 +; GFX11-NEXT: s_lshr_b32 s59, s18, 16 +; GFX11-NEXT: s_lshr_b32 s60, s17, 16 +; GFX11-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-NEXT: s_lshr_b32 s62, s15, 16 +; GFX11-NEXT: s_lshr_b32 s63, s14, 16 +; GFX11-NEXT: s_lshr_b32 s72, s13, 16 +; GFX11-NEXT: s_lshr_b32 s73, s12, 16 +; GFX11-NEXT: s_lshr_b32 s74, s11, 16 +; GFX11-NEXT: s_lshr_b32 s75, s10, 16 +; GFX11-NEXT: s_lshr_b32 s76, s9, 16 +; GFX11-NEXT: s_lshr_b32 s77, s8, 16 +; GFX11-NEXT: s_lshr_b32 s78, s7, 16 +; GFX11-NEXT: s_lshr_b32 s79, s6, 16 +; GFX11-NEXT: s_lshr_b32 s88, s5, 16 +; GFX11-NEXT: s_lshr_b32 s89, s4, 16 ; GFX11-NEXT: s_lshr_b32 s90, s3, 16 ; GFX11-NEXT: s_lshr_b32 s91, s2, 16 ; GFX11-NEXT: s_lshr_b32 s92, s1, 16 @@ -10228,47 +10450,47 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s92 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s91 ; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s90 -; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s89 -; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s88 -; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s79 -; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s78 -; GFX11-NEXT: s_pack_ll_b32_b16 s20, s20, s77 -; GFX11-NEXT: s_pack_ll_b32_b16 s21, s21, s76 -; GFX11-NEXT: s_pack_ll_b32_b16 s22, s22, s75 -; GFX11-NEXT: s_pack_ll_b32_b16 s23, s23, s74 -; GFX11-NEXT: s_pack_ll_b32_b16 s24, s24, s73 -; GFX11-NEXT: s_pack_ll_b32_b16 s25, s25, s72 -; GFX11-NEXT: s_pack_ll_b32_b16 s26, s26, s63 -; GFX11-NEXT: s_pack_ll_b32_b16 s27, s27, s62 -; GFX11-NEXT: s_pack_ll_b32_b16 s28, s28, s61 -; GFX11-NEXT: s_pack_ll_b32_b16 s29, s29, s60 -; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s59 -; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s58 -; GFX11-NEXT: s_pack_ll_b32_b16 s6, s6, s57 -; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s56 -; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s47 -; GFX11-NEXT: s_pack_ll_b32_b16 s9, s9, s46 -; GFX11-NEXT: s_pack_ll_b32_b16 s10, s10, s45 -; GFX11-NEXT: s_pack_ll_b32_b16 s11, s11, s44 -; GFX11-NEXT: s_pack_ll_b32_b16 s12, s12, s43 -; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s42 -; GFX11-NEXT: s_pack_ll_b32_b16 s15, s15, s41 -; GFX11-NEXT: s_pack_ll_b32_b16 s14, s14, s40 +; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s89 +; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s88 +; GFX11-NEXT: s_pack_ll_b32_b16 s6, s6, s79 +; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s78 +; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s77 +; GFX11-NEXT: s_pack_ll_b32_b16 s9, s9, s76 +; GFX11-NEXT: s_pack_ll_b32_b16 s10, s10, s75 +; GFX11-NEXT: s_pack_ll_b32_b16 s11, s11, s74 +; GFX11-NEXT: s_pack_ll_b32_b16 s12, s12, s73 +; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s72 +; GFX11-NEXT: s_pack_ll_b32_b16 s14, s14, s63 +; GFX11-NEXT: s_pack_ll_b32_b16 s15, s15, s62 +; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s61 +; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s60 +; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s59 +; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s58 +; GFX11-NEXT: s_pack_ll_b32_b16 s20, s20, s57 +; GFX11-NEXT: s_pack_ll_b32_b16 s21, s21, s56 +; GFX11-NEXT: s_pack_ll_b32_b16 s22, s22, s47 +; GFX11-NEXT: s_pack_ll_b32_b16 s23, s23, s46 +; GFX11-NEXT: s_pack_ll_b32_b16 s24, s24, s45 +; GFX11-NEXT: s_pack_ll_b32_b16 s25, s25, s44 +; GFX11-NEXT: s_pack_ll_b32_b16 s26, s26, s43 +; GFX11-NEXT: s_pack_ll_b32_b16 s27, s27, s42 +; GFX11-NEXT: s_pack_ll_b32_b16 s29, s29, s41 +; GFX11-NEXT: s_pack_ll_b32_b16 s28, s28, s40 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 -; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 -; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 -; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 -; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 -; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 -; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 -; GFX11-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 -; GFX11-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 -; GFX11-NEXT: v_dual_mov_b32 v22, s8 :: v_dual_mov_b32 v23, s9 -; GFX11-NEXT: v_dual_mov_b32 v24, s10 :: v_dual_mov_b32 v25, s11 -; GFX11-NEXT: v_dual_mov_b32 v26, s12 :: v_dual_mov_b32 v27, s13 -; GFX11-NEXT: v_dual_mov_b32 v28, s15 :: v_dual_mov_b32 v29, s14 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GFX11-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v21, s21 +; GFX11-NEXT: v_dual_mov_b32 v22, s22 :: v_dual_mov_b32 v23, s23 +; GFX11-NEXT: v_dual_mov_b32 v24, s24 :: v_dual_mov_b32 v25, s25 +; GFX11-NEXT: v_dual_mov_b32 v26, s26 :: v_dual_mov_b32 v27, s27 +; GFX11-NEXT: v_dual_mov_b32 v28, s29 :: v_dual_mov_b32 v29, s28 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB17_4: ; GFX11-NEXT: ; implicit-def: $sgpr93 @@ -17994,7 +18216,7 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 ; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 ; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 -; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v27 ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v28 ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 ; SI-NEXT: .LBB30_4: ; %end @@ -21548,23 +21770,21 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; SI-NEXT: v_readfirstlane_b32 s45, v1 -; SI-NEXT: v_readfirstlane_b32 s44, v2 -; SI-NEXT: v_readfirstlane_b32 s43, v3 -; SI-NEXT: v_readfirstlane_b32 s42, v4 -; SI-NEXT: v_readfirstlane_b32 s41, v5 -; SI-NEXT: v_readfirstlane_b32 s40, v6 -; SI-NEXT: v_readfirstlane_b32 s15, v7 -; SI-NEXT: v_readfirstlane_b32 s14, v8 -; SI-NEXT: v_readfirstlane_b32 s13, v9 -; SI-NEXT: v_readfirstlane_b32 s12, v10 -; SI-NEXT: v_readfirstlane_b32 s11, v11 -; SI-NEXT: v_readfirstlane_b32 s10, v12 -; SI-NEXT: v_readfirstlane_b32 s8, v13 -; SI-NEXT: v_readfirstlane_b32 s7, v14 -; SI-NEXT: v_readfirstlane_b32 s6, v15 +; SI-NEXT: v_mov_b32_e32 v31, s16 +; SI-NEXT: v_mov_b32_e32 v29, s17 +; SI-NEXT: v_mov_b32_e32 v50, s18 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v16 +; SI-NEXT: v_mov_b32_e32 v51, s19 +; SI-NEXT: v_mov_b32_e32 v49, s20 +; SI-NEXT: v_mov_b32_e32 v48, s21 +; SI-NEXT: v_mov_b32_e32 v39, s22 +; SI-NEXT: v_mov_b32_e32 v38, s23 +; SI-NEXT: v_mov_b32_e32 v37, s24 +; SI-NEXT: v_mov_b32_e32 v36, s25 +; SI-NEXT: v_mov_b32_e32 v35, s26 +; SI-NEXT: v_mov_b32_e32 v34, s27 +; SI-NEXT: v_mov_b32_e32 v33, s28 +; SI-NEXT: v_mov_b32_e32 v32, s29 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -21583,453 +21803,577 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB33_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s4 -; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 -; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s40, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s41, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_lshr_b32 s4, s42, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 -; SI-NEXT: s_lshr_b32 s4, s43, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 -; SI-NEXT: s_lshr_b32 s4, s44, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 -; SI-NEXT: s_lshr_b32 s4, s45, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v58, v34 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s10 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v61, s11 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v62, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v35 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v63, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v39 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v51 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v31 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v15 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v14 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v13 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v12 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v11 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v10 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v9 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v8 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v7 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v5 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v4 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v3 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v2 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v1 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v32 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v33 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v37 ; SI-NEXT: s_cbranch_execnz .LBB33_3 ; SI-NEXT: .LBB33_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v6, s16, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 +; SI-NEXT: v_add_f32_e32 v33, 1.0, v33 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_add_f32_e64 v3, s18, 1.0 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_add_f32_e64 v5, s19, 1.0 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_add_f32_e64 v7, s20, 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; SI-NEXT: v_add_f32_e64 v16, s12, 1.0 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e64 v11, s22, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v16 -; SI-NEXT: v_add_f32_e64 v40, s6, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e64 v17, s25, 1.0 -; SI-NEXT: v_add_f32_e64 v28, s45, 1.0 -; SI-NEXT: v_add_f32_e64 v23, s40, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v17 -; SI-NEXT: v_add_f32_e64 v19, s26, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 -; SI-NEXT: v_add_f32_e64 v48, s8, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v43 -; SI-NEXT: v_add_f32_e64 v15, s24, 1.0 -; SI-NEXT: v_add_f32_e64 v26, s29, 1.0 -; SI-NEXT: v_add_f32_e64 v25, s41, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 -; SI-NEXT: v_add_f32_e64 v22, s27, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 -; SI-NEXT: v_add_f32_e64 v18, s13, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v46 -; SI-NEXT: v_add_f32_e64 v13, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v24, s28, 1.0 -; SI-NEXT: v_add_f32_e64 v27, s42, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v27 -; SI-NEXT: v_add_f32_e64 v37, s10, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v56 -; SI-NEXT: v_add_f32_e64 v29, s43, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v58 -; SI-NEXT: v_add_f32_e64 v9, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v31, s44, 1.0 -; SI-NEXT: v_add_f32_e64 v21, s15, 1.0 -; SI-NEXT: v_add_f32_e64 v20, s14, 1.0 -; SI-NEXT: v_add_f32_e64 v33, s11, 1.0 -; SI-NEXT: v_add_f32_e64 v52, s7, 1.0 -; SI-NEXT: v_add_f32_e64 v44, s9, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v31 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v50 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v51 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v49 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v39 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v38 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v36 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v35 +; SI-NEXT: v_add_f32_e32 v34, 1.0, v34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v43, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v44, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v60 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v46, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v56, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v58, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v60, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: .LBB33_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v4, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v58 -; SI-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 4, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v10, v6 -; SI-NEXT: buffer_store_dword v6, v4, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v4, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v28 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v3 -; SI-NEXT: v_add_i32_e32 v10, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: buffer_store_dword v4, v10, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v46 -; SI-NEXT: v_add_i32_e32 v6, vcc, 12, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v54 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v44 -; SI-NEXT: v_add_i32_e32 v6, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v26 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v9 -; SI-NEXT: v_add_i32_e32 v6, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v24 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v40 -; SI-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v22 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v13 -; SI-NEXT: v_add_i32_e32 v6, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v20 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v52 -; SI-NEXT: v_add_i32_e32 v6, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v17 -; SI-NEXT: v_add_i32_e32 v6, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v17 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v48 -; SI-NEXT: v_add_i32_e32 v6, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v18 -; SI-NEXT: v_add_i32_e32 v6, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v37 -; SI-NEXT: v_add_i32_e32 v6, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v16 -; SI-NEXT: v_add_i32_e32 v6, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v33 -; SI-NEXT: v_add_i32_e32 v6, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v31 -; SI-NEXT: v_add_i32_e32 v6, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v29 -; SI-NEXT: v_add_i32_e32 v6, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v27 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v25 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v23 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v21 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v20 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v63 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v62 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v61 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v1 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v47 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x68, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v45 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x6c, v0 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 -; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v38 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x70, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -22048,66 +22392,93 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; kill: killed $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; kill: killed $vgpr53 +; SI-NEXT: ; kill: killed $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr52 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; kill: killed $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; kill: killed $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; kill: killed $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; kill: killed $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; kill: killed $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; kill: killed $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; kill: killed $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; kill: killed $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; kill: killed $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; kill: killed $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; kill: killed $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; kill: killed $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; kill: killed $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; kill: killed $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; kill: killed $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; kill: killed $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; kill: killed $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; kill: killed $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; kill: killed $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: s_branch .LBB33_2 ; ; VI-LABEL: bitcast_v30f32_to_v60f16_scalar: @@ -28034,30 +28405,58 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v18, s30, 0 -; SI-NEXT: v_writelane_b32 v18, s31, 1 -; SI-NEXT: v_writelane_b32 v18, s34, 2 -; SI-NEXT: v_writelane_b32 v18, s35, 3 -; SI-NEXT: v_writelane_b32 v18, s36, 4 -; SI-NEXT: v_writelane_b32 v18, s37, 5 -; SI-NEXT: v_writelane_b32 v18, s38, 6 -; SI-NEXT: v_writelane_b32 v18, s39, 7 -; SI-NEXT: v_writelane_b32 v18, s48, 8 -; SI-NEXT: v_writelane_b32 v18, s49, 9 -; SI-NEXT: v_writelane_b32 v18, s50, 10 -; SI-NEXT: v_writelane_b32 v18, s51, 11 -; SI-NEXT: v_writelane_b32 v18, s52, 12 -; SI-NEXT: v_writelane_b32 v18, s53, 13 -; SI-NEXT: v_writelane_b32 v18, s54, 14 +; SI-NEXT: v_writelane_b32 v20, s30, 0 +; SI-NEXT: v_writelane_b32 v20, s31, 1 +; SI-NEXT: v_writelane_b32 v20, s34, 2 +; SI-NEXT: v_writelane_b32 v20, s35, 3 +; SI-NEXT: v_writelane_b32 v20, s36, 4 +; SI-NEXT: v_writelane_b32 v20, s37, 5 +; SI-NEXT: v_writelane_b32 v20, s38, 6 +; SI-NEXT: v_writelane_b32 v20, s39, 7 +; SI-NEXT: v_writelane_b32 v20, s48, 8 +; SI-NEXT: v_mov_b32_e32 v18, s16 +; SI-NEXT: v_mov_b32_e32 v19, s17 +; SI-NEXT: v_writelane_b32 v20, s49, 9 +; SI-NEXT: v_readfirstlane_b32 s46, v18 +; SI-NEXT: v_mov_b32_e32 v18, s18 +; SI-NEXT: v_readfirstlane_b32 s47, v19 +; SI-NEXT: v_mov_b32_e32 v19, s19 +; SI-NEXT: v_writelane_b32 v20, s50, 10 +; SI-NEXT: v_readfirstlane_b32 s44, v18 +; SI-NEXT: v_mov_b32_e32 v18, s20 +; SI-NEXT: v_readfirstlane_b32 s45, v19 +; SI-NEXT: v_mov_b32_e32 v19, s21 +; SI-NEXT: v_writelane_b32 v20, s51, 11 +; SI-NEXT: v_readfirstlane_b32 s42, v18 +; SI-NEXT: v_mov_b32_e32 v18, s22 +; SI-NEXT: v_readfirstlane_b32 s43, v19 +; SI-NEXT: v_mov_b32_e32 v19, s23 +; SI-NEXT: v_writelane_b32 v20, s52, 12 +; SI-NEXT: v_readfirstlane_b32 s40, v18 +; SI-NEXT: v_mov_b32_e32 v18, s24 +; SI-NEXT: v_readfirstlane_b32 s41, v19 +; SI-NEXT: v_mov_b32_e32 v19, s25 +; SI-NEXT: v_writelane_b32 v20, s53, 13 +; SI-NEXT: v_readfirstlane_b32 s24, v18 +; SI-NEXT: v_mov_b32_e32 v18, s26 +; SI-NEXT: v_readfirstlane_b32 s25, v19 +; SI-NEXT: v_mov_b32_e32 v19, s27 +; SI-NEXT: v_writelane_b32 v20, s54, 14 +; SI-NEXT: v_readfirstlane_b32 s22, v18 +; SI-NEXT: v_mov_b32_e32 v18, s28 +; SI-NEXT: v_readfirstlane_b32 s23, v19 +; SI-NEXT: v_mov_b32_e32 v19, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; SI-NEXT: v_writelane_b32 v18, s55, 15 -; SI-NEXT: v_readfirstlane_b32 s42, v1 -; SI-NEXT: v_readfirstlane_b32 s43, v2 -; SI-NEXT: v_readfirstlane_b32 s40, v3 -; SI-NEXT: v_readfirstlane_b32 s41, v4 +; SI-NEXT: v_writelane_b32 v20, s55, 15 +; SI-NEXT: v_readfirstlane_b32 s20, v18 +; SI-NEXT: v_readfirstlane_b32 s21, v19 +; SI-NEXT: v_readfirstlane_b32 s18, v1 +; SI-NEXT: v_readfirstlane_b32 s19, v2 +; SI-NEXT: v_readfirstlane_b32 s16, v3 +; SI-NEXT: v_readfirstlane_b32 s17, v4 ; SI-NEXT: v_readfirstlane_b32 s14, v5 ; SI-NEXT: v_readfirstlane_b32 s15, v6 ; SI-NEXT: v_readfirstlane_b32 s12, v7 @@ -28069,9 +28468,9 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; SI-NEXT: v_readfirstlane_b32 s6, v13 ; SI-NEXT: v_readfirstlane_b32 s7, v14 ; SI-NEXT: v_readfirstlane_b32 s4, v15 -; SI-NEXT: s_and_b64 s[44:45], vcc, exec +; SI-NEXT: s_and_b64 s[26:27], vcc, exec ; SI-NEXT: v_readfirstlane_b32 s5, v16 -; SI-NEXT: v_writelane_b32 v18, s64, 16 +; SI-NEXT: v_writelane_b32 v20, s64, 16 ; SI-NEXT: s_cbranch_scc0 .LBB41_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s34, s5, 16 @@ -28080,30 +28479,30 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; SI-NEXT: s_lshr_b32 s37, s11, 16 ; SI-NEXT: s_lshr_b32 s38, s13, 16 ; SI-NEXT: s_lshr_b32 s39, s15, 16 -; SI-NEXT: s_lshr_b32 s48, s41, 16 -; SI-NEXT: s_lshr_b32 s49, s43, 16 -; SI-NEXT: s_lshr_b32 s50, s29, 16 -; SI-NEXT: s_lshr_b32 s51, s27, 16 +; SI-NEXT: s_lshr_b32 s48, s17, 16 +; SI-NEXT: s_lshr_b32 s49, s19, 16 +; SI-NEXT: s_lshr_b32 s50, s21, 16 +; SI-NEXT: s_lshr_b32 s51, s23, 16 ; SI-NEXT: s_lshr_b32 s52, s25, 16 -; SI-NEXT: s_lshr_b32 s53, s23, 16 -; SI-NEXT: s_lshr_b32 s54, s21, 16 -; SI-NEXT: s_lshr_b32 s55, s19, 16 -; SI-NEXT: s_lshr_b32 s64, s17, 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 16 +; SI-NEXT: s_lshr_b32 s53, s41, 16 +; SI-NEXT: s_lshr_b32 s54, s43, 16 +; SI-NEXT: s_lshr_b32 s55, s45, 16 +; SI-NEXT: s_lshr_b32 s64, s47, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 ; SI-NEXT: s_lshr_b64 s[56:57], s[8:9], 16 ; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16 ; SI-NEXT: s_lshr_b64 s[60:61], s[12:13], 16 ; SI-NEXT: s_lshr_b64 s[62:63], s[14:15], 16 -; SI-NEXT: s_lshr_b64 s[72:73], s[40:41], 16 -; SI-NEXT: s_lshr_b64 s[74:75], s[42:43], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[78:79], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[22:23], 16 ; SI-NEXT: s_lshr_b64 s[88:89], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[90:91], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[92:93], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[94:95], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[30:31], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[46:47], 16 ; SI-NEXT: s_cbranch_execnz .LBB41_3 ; SI-NEXT: .LBB41_2: ; %cmp.true ; SI-NEXT: s_add_u32 s4, s4, 3 @@ -28118,167 +28517,167 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; SI-NEXT: s_addc_u32 s13, s13, 0 ; SI-NEXT: s_add_u32 s14, s14, 3 ; SI-NEXT: s_addc_u32 s15, s15, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 ; SI-NEXT: s_add_u32 s40, s40, 3 ; SI-NEXT: s_addc_u32 s41, s41, 0 ; SI-NEXT: s_add_u32 s42, s42, 3 ; SI-NEXT: s_addc_u32 s43, s43, 0 -; SI-NEXT: s_add_u32 s28, s28, 3 -; SI-NEXT: s_addc_u32 s29, s29, 0 -; SI-NEXT: s_add_u32 s26, s26, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 -; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s25, s25, 0 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: s_add_u32 s16, s16, 3 -; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s44, s44, 3 +; SI-NEXT: s_addc_u32 s45, s45, 0 +; SI-NEXT: s_add_u32 s46, s46, 3 +; SI-NEXT: s_addc_u32 s47, s47, 0 ; SI-NEXT: s_lshr_b32 s34, s5, 16 ; SI-NEXT: s_lshr_b32 s35, s7, 16 ; SI-NEXT: s_lshr_b32 s36, s9, 16 ; SI-NEXT: s_lshr_b32 s37, s11, 16 ; SI-NEXT: s_lshr_b32 s38, s13, 16 ; SI-NEXT: s_lshr_b32 s39, s15, 16 -; SI-NEXT: s_lshr_b32 s48, s41, 16 -; SI-NEXT: s_lshr_b32 s49, s43, 16 -; SI-NEXT: s_lshr_b32 s50, s29, 16 -; SI-NEXT: s_lshr_b32 s51, s27, 16 +; SI-NEXT: s_lshr_b32 s48, s17, 16 +; SI-NEXT: s_lshr_b32 s49, s19, 16 +; SI-NEXT: s_lshr_b32 s50, s21, 16 +; SI-NEXT: s_lshr_b32 s51, s23, 16 ; SI-NEXT: s_lshr_b32 s52, s25, 16 -; SI-NEXT: s_lshr_b32 s53, s23, 16 -; SI-NEXT: s_lshr_b32 s54, s21, 16 -; SI-NEXT: s_lshr_b32 s55, s19, 16 -; SI-NEXT: s_lshr_b32 s64, s17, 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 16 +; SI-NEXT: s_lshr_b32 s53, s41, 16 +; SI-NEXT: s_lshr_b32 s54, s43, 16 +; SI-NEXT: s_lshr_b32 s55, s45, 16 +; SI-NEXT: s_lshr_b32 s64, s47, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 ; SI-NEXT: s_lshr_b64 s[56:57], s[8:9], 16 ; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16 ; SI-NEXT: s_lshr_b64 s[60:61], s[12:13], 16 ; SI-NEXT: s_lshr_b64 s[62:63], s[14:15], 16 -; SI-NEXT: s_lshr_b64 s[72:73], s[40:41], 16 -; SI-NEXT: s_lshr_b64 s[74:75], s[42:43], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[78:79], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[22:23], 16 ; SI-NEXT: s_lshr_b64 s[88:89], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[90:91], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[92:93], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[94:95], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[30:31], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[46:47], 16 ; SI-NEXT: .LBB41_3: ; %end -; SI-NEXT: s_lshl_b32 s45, s30, 16 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s16, s16, s45 -; SI-NEXT: v_mov_b32_e32 v1, s16 -; SI-NEXT: s_and_b32 s16, s17, 0xffff -; SI-NEXT: s_lshl_b32 s17, s64, 16 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_lshl_b32 s16, s94, 16 -; SI-NEXT: s_and_b32 s17, s18, 0xffff -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_mov_b32_e32 v3, s16 -; SI-NEXT: s_and_b32 s16, s19, 0xffff -; SI-NEXT: s_lshl_b32 s17, s55, 16 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_mov_b32_e32 v4, s16 -; SI-NEXT: s_lshl_b32 s16, s92, 16 -; SI-NEXT: s_and_b32 s17, s20, 0xffff -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_mov_b32_e32 v5, s16 -; SI-NEXT: s_and_b32 s16, s21, 0xffff -; SI-NEXT: s_lshl_b32 s17, s54, 16 +; SI-NEXT: s_lshl_b32 s27, s30, 16 +; SI-NEXT: s_and_b32 s29, s46, 0xffff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: v_mov_b32_e32 v1, s27 +; SI-NEXT: s_and_b32 s27, s47, 0xffff +; SI-NEXT: s_lshl_b32 s29, s64, 16 +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_lshl_b32 s27, s94, 16 +; SI-NEXT: s_and_b32 s29, s44, 0xffff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: v_mov_b32_e32 v3, s27 +; SI-NEXT: s_and_b32 s27, s45, 0xffff +; SI-NEXT: s_lshl_b32 s29, s55, 16 +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_mov_b32_e32 v4, s27 +; SI-NEXT: s_lshl_b32 s27, s92, 16 +; SI-NEXT: s_and_b32 s29, s42, 0xffff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: v_mov_b32_e32 v5, s27 +; SI-NEXT: s_and_b32 s27, s43, 0xffff +; SI-NEXT: s_lshl_b32 s29, s54, 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: v_mov_b32_e32 v6, s16 +; SI-NEXT: v_mov_b32_e32 v6, s27 ; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_lshl_b32 s16, s90, 16 -; SI-NEXT: s_and_b32 s17, s22, 0xffff +; SI-NEXT: s_lshl_b32 s27, s90, 16 +; SI-NEXT: s_and_b32 s29, s40, 0xffff ; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_or_b32 s27, s29, s27 ; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s23, 0xffff -; SI-NEXT: s_lshl_b32 s17, s53, 16 +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_and_b32 s27, s41, 0xffff +; SI-NEXT: s_lshl_b32 s29, s53, 16 ; SI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s24, 0xffff -; SI-NEXT: s_lshl_b32 s17, s88, 16 +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_lshl_b32 s27, s88, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s24, s24, s27 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s25, 0xffff -; SI-NEXT: s_lshl_b32 s17, s52, 16 +; SI-NEXT: v_mov_b32_e32 v2, s24 +; SI-NEXT: s_and_b32 s24, s25, 0xffff +; SI-NEXT: s_lshl_b32 s25, s52, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s24, s24, s25 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s26, 0xffff -; SI-NEXT: s_lshl_b32 s17, s78, 16 +; SI-NEXT: v_mov_b32_e32 v2, s24 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_lshl_b32 s24, s78, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s22, s22, s24 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s27, 0xffff -; SI-NEXT: s_lshl_b32 s17, s51, 16 +; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: s_and_b32 s22, s23, 0xffff +; SI-NEXT: s_lshl_b32 s23, s51, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s22, s22, s23 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s28, 0xffff -; SI-NEXT: s_lshl_b32 s17, s76, 16 +; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_lshl_b32 s22, s76, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s20, s20, s22 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s29, 0xffff -; SI-NEXT: s_lshl_b32 s17, s50, 16 +; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_and_b32 s20, s21, 0xffff +; SI-NEXT: s_lshl_b32 s21, s50, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s20, s20, s21 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s42, 0xffff -; SI-NEXT: s_lshl_b32 s17, s74, 16 +; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s20, s74, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s18, s18, s20 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s43, 0xffff -; SI-NEXT: s_lshl_b32 s17, s49, 16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_and_b32 s18, s19, 0xffff +; SI-NEXT: s_lshl_b32 s19, s49, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s40, 0xffff -; SI-NEXT: s_lshl_b32 s17, s72, 16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s18, s72, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s16, s16, s18 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s41, 0xffff +; SI-NEXT: s_and_b32 s16, s17, 0xffff ; SI-NEXT: s_lshl_b32 s17, s48, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 ; SI-NEXT: s_or_b32 s16, s16, s17 @@ -28342,7 +28741,7 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_lshl_b32 s8, s46, 16 +; SI-NEXT: s_lshl_b32 s8, s28, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x64, v0 ; SI-NEXT: s_or_b32 s6, s6, s8 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -28356,7 +28755,7 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s6, s44, 16 +; SI-NEXT: s_lshl_b32 s6, s26, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x6c, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -28370,25 +28769,25 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: v_readlane_b32 s64, v18, 16 -; SI-NEXT: v_readlane_b32 s55, v18, 15 -; SI-NEXT: v_readlane_b32 s54, v18, 14 -; SI-NEXT: v_readlane_b32 s53, v18, 13 -; SI-NEXT: v_readlane_b32 s52, v18, 12 -; SI-NEXT: v_readlane_b32 s51, v18, 11 -; SI-NEXT: v_readlane_b32 s50, v18, 10 -; SI-NEXT: v_readlane_b32 s49, v18, 9 -; SI-NEXT: v_readlane_b32 s48, v18, 8 -; SI-NEXT: v_readlane_b32 s39, v18, 7 -; SI-NEXT: v_readlane_b32 s38, v18, 6 -; SI-NEXT: v_readlane_b32 s37, v18, 5 -; SI-NEXT: v_readlane_b32 s36, v18, 4 -; SI-NEXT: v_readlane_b32 s35, v18, 3 -; SI-NEXT: v_readlane_b32 s34, v18, 2 -; SI-NEXT: v_readlane_b32 s31, v18, 1 -; SI-NEXT: v_readlane_b32 s30, v18, 0 +; SI-NEXT: v_readlane_b32 s64, v20, 16 +; SI-NEXT: v_readlane_b32 s55, v20, 15 +; SI-NEXT: v_readlane_b32 s54, v20, 14 +; SI-NEXT: v_readlane_b32 s53, v20, 13 +; SI-NEXT: v_readlane_b32 s52, v20, 12 +; SI-NEXT: v_readlane_b32 s51, v20, 11 +; SI-NEXT: v_readlane_b32 s50, v20, 10 +; SI-NEXT: v_readlane_b32 s49, v20, 9 +; SI-NEXT: v_readlane_b32 s48, v20, 8 +; SI-NEXT: v_readlane_b32 s39, v20, 7 +; SI-NEXT: v_readlane_b32 s38, v20, 6 +; SI-NEXT: v_readlane_b32 s37, v20, 5 +; SI-NEXT: v_readlane_b32 s36, v20, 4 +; SI-NEXT: v_readlane_b32 s35, v20, 3 +; SI-NEXT: v_readlane_b32 s34, v20, 2 +; SI-NEXT: v_readlane_b32 s31, v20, 1 +; SI-NEXT: v_readlane_b32 s30, v20, 0 ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -28419,10 +28818,10 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr37 ; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: ; implicit-def: $sgpr36 -; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr35 ; SI-NEXT: ; implicit-def: $sgpr34 -; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: s_branch .LBB41_2 ; ; VI-LABEL: bitcast_v15i64_to_v60i16_scalar: @@ -28433,18 +28832,46 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: v_writelane_b32 v30, s30, 0 ; VI-NEXT: v_writelane_b32 v30, s31, 1 +; VI-NEXT: v_mov_b32_e32 v17, s16 +; VI-NEXT: v_mov_b32_e32 v18, s17 ; VI-NEXT: v_writelane_b32 v30, s34, 2 +; VI-NEXT: v_mov_b32_e32 v19, s18 +; VI-NEXT: v_readfirstlane_b32 s56, v17 +; VI-NEXT: v_mov_b32_e32 v17, s19 +; VI-NEXT: v_readfirstlane_b32 s47, v18 +; VI-NEXT: v_mov_b32_e32 v18, s20 ; VI-NEXT: v_writelane_b32 v30, s35, 3 +; VI-NEXT: v_readfirstlane_b32 s46, v19 +; VI-NEXT: v_mov_b32_e32 v19, s21 +; VI-NEXT: v_readfirstlane_b32 s45, v17 +; VI-NEXT: v_mov_b32_e32 v17, s22 +; VI-NEXT: v_readfirstlane_b32 s44, v18 +; VI-NEXT: v_mov_b32_e32 v18, s23 ; VI-NEXT: v_writelane_b32 v30, s36, 4 +; VI-NEXT: v_readfirstlane_b32 s43, v19 +; VI-NEXT: v_mov_b32_e32 v19, s24 +; VI-NEXT: v_readfirstlane_b32 s42, v17 +; VI-NEXT: v_mov_b32_e32 v17, s25 +; VI-NEXT: v_readfirstlane_b32 s41, v18 +; VI-NEXT: v_mov_b32_e32 v18, s26 ; VI-NEXT: v_writelane_b32 v30, s37, 5 +; VI-NEXT: v_readfirstlane_b32 s40, v19 +; VI-NEXT: v_mov_b32_e32 v19, s27 +; VI-NEXT: v_readfirstlane_b32 s26, v17 +; VI-NEXT: v_mov_b32_e32 v17, s28 +; VI-NEXT: v_readfirstlane_b32 s25, v18 +; VI-NEXT: v_mov_b32_e32 v18, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; VI-NEXT: v_writelane_b32 v30, s38, 6 -; VI-NEXT: v_readfirstlane_b32 s45, v0 -; VI-NEXT: v_readfirstlane_b32 s44, v1 -; VI-NEXT: v_readfirstlane_b32 s43, v2 -; VI-NEXT: v_readfirstlane_b32 s42, v3 -; VI-NEXT: v_readfirstlane_b32 s41, v4 -; VI-NEXT: v_readfirstlane_b32 s40, v5 +; VI-NEXT: v_readfirstlane_b32 s24, v19 +; VI-NEXT: v_readfirstlane_b32 s23, v17 +; VI-NEXT: v_readfirstlane_b32 s22, v18 +; VI-NEXT: v_readfirstlane_b32 s21, v0 +; VI-NEXT: v_readfirstlane_b32 s20, v1 +; VI-NEXT: v_readfirstlane_b32 s19, v2 +; VI-NEXT: v_readfirstlane_b32 s18, v3 +; VI-NEXT: v_readfirstlane_b32 s17, v4 +; VI-NEXT: v_readfirstlane_b32 s16, v5 ; VI-NEXT: v_readfirstlane_b32 s15, v6 ; VI-NEXT: v_readfirstlane_b32 s14, v7 ; VI-NEXT: v_readfirstlane_b32 s13, v8 @@ -28459,9 +28886,9 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; VI-NEXT: v_writelane_b32 v30, s39, 7 ; VI-NEXT: s_cbranch_scc0 .LBB41_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s46, s7, 16 -; VI-NEXT: s_lshr_b32 s47, s6, 16 -; VI-NEXT: s_lshr_b32 s56, s8, 16 +; VI-NEXT: s_lshr_b32 s27, s7, 16 +; VI-NEXT: s_lshr_b32 s28, s6, 16 +; VI-NEXT: s_lshr_b32 s29, s8, 16 ; VI-NEXT: s_lshr_b32 s57, s9, 16 ; VI-NEXT: s_lshr_b32 s58, s10, 16 ; VI-NEXT: s_lshr_b32 s59, s11, 16 @@ -28469,26 +28896,26 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; VI-NEXT: s_lshr_b32 s61, s13, 16 ; VI-NEXT: s_lshr_b32 s62, s14, 16 ; VI-NEXT: s_lshr_b32 s63, s15, 16 -; VI-NEXT: s_lshr_b32 s72, s40, 16 -; VI-NEXT: s_lshr_b32 s73, s41, 16 -; VI-NEXT: s_lshr_b32 s74, s42, 16 -; VI-NEXT: s_lshr_b32 s75, s43, 16 -; VI-NEXT: s_lshr_b32 s76, s44, 16 -; VI-NEXT: s_lshr_b32 s77, s45, 16 -; VI-NEXT: s_lshr_b32 s78, s29, 16 -; VI-NEXT: s_lshr_b32 s79, s28, 16 -; VI-NEXT: s_lshr_b32 s88, s27, 16 -; VI-NEXT: s_lshr_b32 s89, s26, 16 -; VI-NEXT: s_lshr_b32 s90, s25, 16 -; VI-NEXT: s_lshr_b32 s91, s24, 16 -; VI-NEXT: s_lshr_b32 s30, s23, 16 -; VI-NEXT: s_lshr_b32 s31, s22, 16 -; VI-NEXT: s_lshr_b32 s34, s21, 16 -; VI-NEXT: s_lshr_b32 s35, s20, 16 -; VI-NEXT: s_lshr_b32 s36, s19, 16 -; VI-NEXT: s_lshr_b32 s37, s18, 16 -; VI-NEXT: s_lshr_b32 s38, s17, 16 -; VI-NEXT: s_lshr_b32 s39, s16, 16 +; VI-NEXT: s_lshr_b32 s72, s16, 16 +; VI-NEXT: s_lshr_b32 s73, s17, 16 +; VI-NEXT: s_lshr_b32 s74, s18, 16 +; VI-NEXT: s_lshr_b32 s75, s19, 16 +; VI-NEXT: s_lshr_b32 s76, s20, 16 +; VI-NEXT: s_lshr_b32 s77, s21, 16 +; VI-NEXT: s_lshr_b32 s78, s22, 16 +; VI-NEXT: s_lshr_b32 s79, s23, 16 +; VI-NEXT: s_lshr_b32 s88, s24, 16 +; VI-NEXT: s_lshr_b32 s89, s25, 16 +; VI-NEXT: s_lshr_b32 s90, s26, 16 +; VI-NEXT: s_lshr_b32 s91, s40, 16 +; VI-NEXT: s_lshr_b32 s30, s41, 16 +; VI-NEXT: s_lshr_b32 s31, s42, 16 +; VI-NEXT: s_lshr_b32 s34, s43, 16 +; VI-NEXT: s_lshr_b32 s35, s44, 16 +; VI-NEXT: s_lshr_b32 s36, s45, 16 +; VI-NEXT: s_lshr_b32 s37, s46, 16 +; VI-NEXT: s_lshr_b32 s38, s47, 16 +; VI-NEXT: s_lshr_b32 s39, s56, 16 ; VI-NEXT: s_cbranch_execnz .LBB41_3 ; VI-NEXT: .LBB41_2: ; %cmp.true ; VI-NEXT: s_add_u32 s6, s6, 3 @@ -28501,29 +28928,29 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; VI-NEXT: s_addc_u32 s12, s12, 0 ; VI-NEXT: s_add_u32 s15, s15, 3 ; VI-NEXT: s_addc_u32 s14, s14, 0 -; VI-NEXT: s_add_u32 s41, s41, 3 -; VI-NEXT: s_addc_u32 s40, s40, 0 -; VI-NEXT: s_add_u32 s43, s43, 3 -; VI-NEXT: s_addc_u32 s42, s42, 0 -; VI-NEXT: s_add_u32 s45, s45, 3 -; VI-NEXT: s_addc_u32 s44, s44, 0 -; VI-NEXT: s_add_u32 s28, s28, 3 -; VI-NEXT: s_addc_u32 s29, s29, 0 -; VI-NEXT: s_add_u32 s26, s26, 3 -; VI-NEXT: s_addc_u32 s27, s27, 0 -; VI-NEXT: s_add_u32 s24, s24, 3 -; VI-NEXT: s_addc_u32 s25, s25, 0 -; VI-NEXT: s_add_u32 s22, s22, 3 -; VI-NEXT: s_addc_u32 s23, s23, 0 -; VI-NEXT: s_add_u32 s20, s20, 3 -; VI-NEXT: s_addc_u32 s21, s21, 0 -; VI-NEXT: s_add_u32 s18, s18, 3 -; VI-NEXT: s_addc_u32 s19, s19, 0 -; VI-NEXT: s_add_u32 s16, s16, 3 -; VI-NEXT: s_addc_u32 s17, s17, 0 -; VI-NEXT: s_lshr_b32 s46, s7, 16 -; VI-NEXT: s_lshr_b32 s47, s6, 16 -; VI-NEXT: s_lshr_b32 s56, s8, 16 +; VI-NEXT: s_add_u32 s17, s17, 3 +; VI-NEXT: s_addc_u32 s16, s16, 0 +; VI-NEXT: s_add_u32 s19, s19, 3 +; VI-NEXT: s_addc_u32 s18, s18, 0 +; VI-NEXT: s_add_u32 s21, s21, 3 +; VI-NEXT: s_addc_u32 s20, s20, 0 +; VI-NEXT: s_add_u32 s23, s23, 3 +; VI-NEXT: s_addc_u32 s22, s22, 0 +; VI-NEXT: s_add_u32 s25, s25, 3 +; VI-NEXT: s_addc_u32 s24, s24, 0 +; VI-NEXT: s_add_u32 s40, s40, 3 +; VI-NEXT: s_addc_u32 s26, s26, 0 +; VI-NEXT: s_add_u32 s42, s42, 3 +; VI-NEXT: s_addc_u32 s41, s41, 0 +; VI-NEXT: s_add_u32 s44, s44, 3 +; VI-NEXT: s_addc_u32 s43, s43, 0 +; VI-NEXT: s_add_u32 s46, s46, 3 +; VI-NEXT: s_addc_u32 s45, s45, 0 +; VI-NEXT: s_add_u32 s56, s56, 3 +; VI-NEXT: s_addc_u32 s47, s47, 0 +; VI-NEXT: s_lshr_b32 s27, s7, 16 +; VI-NEXT: s_lshr_b32 s28, s6, 16 +; VI-NEXT: s_lshr_b32 s29, s8, 16 ; VI-NEXT: s_lshr_b32 s57, s9, 16 ; VI-NEXT: s_lshr_b32 s58, s10, 16 ; VI-NEXT: s_lshr_b32 s59, s11, 16 @@ -28531,137 +28958,137 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; VI-NEXT: s_lshr_b32 s61, s13, 16 ; VI-NEXT: s_lshr_b32 s62, s14, 16 ; VI-NEXT: s_lshr_b32 s63, s15, 16 -; VI-NEXT: s_lshr_b32 s72, s40, 16 -; VI-NEXT: s_lshr_b32 s73, s41, 16 -; VI-NEXT: s_lshr_b32 s74, s42, 16 -; VI-NEXT: s_lshr_b32 s75, s43, 16 -; VI-NEXT: s_lshr_b32 s76, s44, 16 -; VI-NEXT: s_lshr_b32 s77, s45, 16 -; VI-NEXT: s_lshr_b32 s78, s29, 16 -; VI-NEXT: s_lshr_b32 s79, s28, 16 -; VI-NEXT: s_lshr_b32 s88, s27, 16 -; VI-NEXT: s_lshr_b32 s89, s26, 16 -; VI-NEXT: s_lshr_b32 s90, s25, 16 -; VI-NEXT: s_lshr_b32 s91, s24, 16 -; VI-NEXT: s_lshr_b32 s30, s23, 16 -; VI-NEXT: s_lshr_b32 s31, s22, 16 -; VI-NEXT: s_lshr_b32 s34, s21, 16 -; VI-NEXT: s_lshr_b32 s35, s20, 16 -; VI-NEXT: s_lshr_b32 s36, s19, 16 -; VI-NEXT: s_lshr_b32 s37, s18, 16 -; VI-NEXT: s_lshr_b32 s38, s17, 16 -; VI-NEXT: s_lshr_b32 s39, s16, 16 +; VI-NEXT: s_lshr_b32 s72, s16, 16 +; VI-NEXT: s_lshr_b32 s73, s17, 16 +; VI-NEXT: s_lshr_b32 s74, s18, 16 +; VI-NEXT: s_lshr_b32 s75, s19, 16 +; VI-NEXT: s_lshr_b32 s76, s20, 16 +; VI-NEXT: s_lshr_b32 s77, s21, 16 +; VI-NEXT: s_lshr_b32 s78, s22, 16 +; VI-NEXT: s_lshr_b32 s79, s23, 16 +; VI-NEXT: s_lshr_b32 s88, s24, 16 +; VI-NEXT: s_lshr_b32 s89, s25, 16 +; VI-NEXT: s_lshr_b32 s90, s26, 16 +; VI-NEXT: s_lshr_b32 s91, s40, 16 +; VI-NEXT: s_lshr_b32 s30, s41, 16 +; VI-NEXT: s_lshr_b32 s31, s42, 16 +; VI-NEXT: s_lshr_b32 s34, s43, 16 +; VI-NEXT: s_lshr_b32 s35, s44, 16 +; VI-NEXT: s_lshr_b32 s36, s45, 16 +; VI-NEXT: s_lshr_b32 s37, s46, 16 +; VI-NEXT: s_lshr_b32 s38, s47, 16 +; VI-NEXT: s_lshr_b32 s39, s56, 16 ; VI-NEXT: .LBB41_3: ; %end -; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_and_b32 s4, 0xffff, s56 ; VI-NEXT: s_lshl_b32 s5, s39, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s16, s38, 16 -; VI-NEXT: s_or_b32 s5, s5, s16 -; VI-NEXT: s_and_b32 s16, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s17, s37, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_and_b32 s17, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s18, s36, 16 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s18, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s19, s35, 16 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_and_b32 s19, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s20, s34, 16 -; VI-NEXT: s_or_b32 s19, s19, s20 -; VI-NEXT: s_and_b32 s20, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s21, s31, 16 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_and_b32 s21, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s22, s30, 16 -; VI-NEXT: s_or_b32 s21, s21, s22 -; VI-NEXT: s_and_b32 s22, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s23, s91, 16 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_and_b32 s23, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s24, s90, 16 -; VI-NEXT: s_or_b32 s23, s23, s24 -; VI-NEXT: s_and_b32 s24, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s25, s89, 16 -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_and_b32 s25, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s26, s88, 16 -; VI-NEXT: s_or_b32 s25, s25, s26 -; VI-NEXT: s_and_b32 s26, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s27, s79, 16 -; VI-NEXT: s_or_b32 s26, s26, s27 -; VI-NEXT: s_and_b32 s27, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s28, s78, 16 -; VI-NEXT: s_or_b32 s27, s27, s28 -; VI-NEXT: s_and_b32 s28, 0xffff, s45 -; VI-NEXT: s_lshl_b32 s29, s77, 16 -; VI-NEXT: s_or_b32 s28, s28, s29 -; VI-NEXT: s_and_b32 s29, 0xffff, s44 -; VI-NEXT: s_lshl_b32 s44, s76, 16 -; VI-NEXT: s_or_b32 s29, s29, s44 +; VI-NEXT: s_and_b32 s5, 0xffff, s47 +; VI-NEXT: s_lshl_b32 s47, s38, 16 +; VI-NEXT: s_or_b32 s5, s5, s47 +; VI-NEXT: s_and_b32 s46, 0xffff, s46 +; VI-NEXT: s_lshl_b32 s47, s37, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s45, 0xffff, s45 +; VI-NEXT: s_lshl_b32 s47, s36, 16 +; VI-NEXT: s_or_b32 s45, s45, s47 +; VI-NEXT: s_and_b32 s44, 0xffff, s44 +; VI-NEXT: s_lshl_b32 s47, s35, 16 +; VI-NEXT: s_or_b32 s44, s44, s47 ; VI-NEXT: s_and_b32 s43, 0xffff, s43 -; VI-NEXT: s_lshl_b32 s44, s75, 16 -; VI-NEXT: s_or_b32 s43, s43, s44 +; VI-NEXT: s_lshl_b32 s47, s34, 16 +; VI-NEXT: s_or_b32 s43, s43, s47 ; VI-NEXT: s_and_b32 s42, 0xffff, s42 -; VI-NEXT: s_lshl_b32 s44, s74, 16 -; VI-NEXT: s_or_b32 s42, s42, s44 +; VI-NEXT: s_lshl_b32 s47, s31, 16 +; VI-NEXT: s_or_b32 s42, s42, s47 ; VI-NEXT: s_and_b32 s41, 0xffff, s41 -; VI-NEXT: s_lshl_b32 s44, s73, 16 -; VI-NEXT: s_or_b32 s41, s41, s44 +; VI-NEXT: s_lshl_b32 s47, s30, 16 +; VI-NEXT: s_or_b32 s41, s41, s47 ; VI-NEXT: s_and_b32 s40, 0xffff, s40 -; VI-NEXT: s_lshl_b32 s44, s72, 16 -; VI-NEXT: s_or_b32 s40, s40, s44 +; VI-NEXT: s_lshl_b32 s47, s91, 16 +; VI-NEXT: s_or_b32 s40, s40, s47 +; VI-NEXT: s_and_b32 s26, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s47, s90, 16 +; VI-NEXT: s_or_b32 s26, s26, s47 +; VI-NEXT: s_and_b32 s25, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s47, s89, 16 +; VI-NEXT: s_or_b32 s25, s25, s47 +; VI-NEXT: s_and_b32 s24, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s47, s88, 16 +; VI-NEXT: s_or_b32 s24, s24, s47 +; VI-NEXT: s_and_b32 s23, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s47, s79, 16 +; VI-NEXT: s_or_b32 s23, s23, s47 +; VI-NEXT: s_and_b32 s22, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s47, s78, 16 +; VI-NEXT: s_or_b32 s22, s22, s47 +; VI-NEXT: s_and_b32 s21, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s47, s77, 16 +; VI-NEXT: s_or_b32 s21, s21, s47 +; VI-NEXT: s_and_b32 s20, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s76, 16 +; VI-NEXT: s_or_b32 s20, s20, s47 +; VI-NEXT: s_and_b32 s19, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s47, s75, 16 +; VI-NEXT: s_or_b32 s19, s19, s47 +; VI-NEXT: s_and_b32 s18, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s47, s74, 16 +; VI-NEXT: s_or_b32 s18, s18, s47 +; VI-NEXT: s_and_b32 s17, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s47, s73, 16 +; VI-NEXT: s_or_b32 s17, s17, s47 +; VI-NEXT: s_and_b32 s16, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s47, s72, 16 +; VI-NEXT: s_or_b32 s16, s16, s47 ; VI-NEXT: s_and_b32 s15, 0xffff, s15 -; VI-NEXT: s_lshl_b32 s44, s63, 16 -; VI-NEXT: s_or_b32 s15, s15, s44 +; VI-NEXT: s_lshl_b32 s47, s63, 16 +; VI-NEXT: s_or_b32 s15, s15, s47 ; VI-NEXT: s_and_b32 s14, 0xffff, s14 -; VI-NEXT: s_lshl_b32 s44, s62, 16 -; VI-NEXT: s_or_b32 s14, s14, s44 +; VI-NEXT: s_lshl_b32 s47, s62, 16 +; VI-NEXT: s_or_b32 s14, s14, s47 ; VI-NEXT: s_and_b32 s13, 0xffff, s13 -; VI-NEXT: s_lshl_b32 s44, s61, 16 -; VI-NEXT: s_or_b32 s13, s13, s44 +; VI-NEXT: s_lshl_b32 s47, s61, 16 +; VI-NEXT: s_or_b32 s13, s13, s47 ; VI-NEXT: s_and_b32 s12, 0xffff, s12 -; VI-NEXT: s_lshl_b32 s44, s60, 16 -; VI-NEXT: s_or_b32 s12, s12, s44 +; VI-NEXT: s_lshl_b32 s47, s60, 16 +; VI-NEXT: s_or_b32 s12, s12, s47 ; VI-NEXT: s_and_b32 s11, 0xffff, s11 -; VI-NEXT: s_lshl_b32 s44, s59, 16 -; VI-NEXT: s_or_b32 s11, s11, s44 +; VI-NEXT: s_lshl_b32 s47, s59, 16 +; VI-NEXT: s_or_b32 s11, s11, s47 ; VI-NEXT: s_and_b32 s10, 0xffff, s10 -; VI-NEXT: s_lshl_b32 s44, s58, 16 -; VI-NEXT: s_or_b32 s10, s10, s44 +; VI-NEXT: s_lshl_b32 s47, s58, 16 +; VI-NEXT: s_or_b32 s10, s10, s47 ; VI-NEXT: s_and_b32 s9, 0xffff, s9 -; VI-NEXT: s_lshl_b32 s44, s57, 16 -; VI-NEXT: s_or_b32 s9, s9, s44 +; VI-NEXT: s_lshl_b32 s47, s57, 16 ; VI-NEXT: s_and_b32 s8, 0xffff, s8 -; VI-NEXT: s_lshl_b32 s44, s56, 16 -; VI-NEXT: s_or_b32 s8, s8, s44 +; VI-NEXT: s_lshl_b32 s29, s29, 16 ; VI-NEXT: s_and_b32 s6, 0xffff, s6 -; VI-NEXT: s_lshl_b32 s44, s47, 16 -; VI-NEXT: s_or_b32 s6, s6, s44 +; VI-NEXT: s_lshl_b32 s28, s28, 16 ; VI-NEXT: s_and_b32 s7, 0xffff, s7 -; VI-NEXT: s_lshl_b32 s44, s46, 16 -; VI-NEXT: s_or_b32 s7, s7, s44 +; VI-NEXT: s_lshl_b32 s27, s27, 16 +; VI-NEXT: s_or_b32 s9, s9, s47 +; VI-NEXT: s_or_b32 s8, s8, s29 +; VI-NEXT: s_or_b32 s6, s6, s28 +; VI-NEXT: s_or_b32 s7, s7, s27 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: v_mov_b32_e32 v3, s17 -; VI-NEXT: v_mov_b32_e32 v4, s18 -; VI-NEXT: v_mov_b32_e32 v5, s19 -; VI-NEXT: v_mov_b32_e32 v6, s20 -; VI-NEXT: v_mov_b32_e32 v7, s21 -; VI-NEXT: v_mov_b32_e32 v8, s22 -; VI-NEXT: v_mov_b32_e32 v9, s23 -; VI-NEXT: v_mov_b32_e32 v10, s24 -; VI-NEXT: v_mov_b32_e32 v11, s25 -; VI-NEXT: v_mov_b32_e32 v12, s26 -; VI-NEXT: v_mov_b32_e32 v13, s27 -; VI-NEXT: v_mov_b32_e32 v14, s28 -; VI-NEXT: v_mov_b32_e32 v15, s29 -; VI-NEXT: v_mov_b32_e32 v16, s43 -; VI-NEXT: v_mov_b32_e32 v17, s42 -; VI-NEXT: v_mov_b32_e32 v18, s41 -; VI-NEXT: v_mov_b32_e32 v19, s40 +; VI-NEXT: v_mov_b32_e32 v2, s46 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s44 +; VI-NEXT: v_mov_b32_e32 v5, s43 +; VI-NEXT: v_mov_b32_e32 v6, s42 +; VI-NEXT: v_mov_b32_e32 v7, s41 +; VI-NEXT: v_mov_b32_e32 v8, s40 +; VI-NEXT: v_mov_b32_e32 v9, s26 +; VI-NEXT: v_mov_b32_e32 v10, s25 +; VI-NEXT: v_mov_b32_e32 v11, s24 +; VI-NEXT: v_mov_b32_e32 v12, s23 +; VI-NEXT: v_mov_b32_e32 v13, s22 +; VI-NEXT: v_mov_b32_e32 v14, s21 +; VI-NEXT: v_mov_b32_e32 v15, s20 +; VI-NEXT: v_mov_b32_e32 v16, s19 +; VI-NEXT: v_mov_b32_e32 v17, s18 +; VI-NEXT: v_mov_b32_e32 v18, s17 +; VI-NEXT: v_mov_b32_e32 v19, s16 ; VI-NEXT: v_mov_b32_e32 v20, s15 ; VI-NEXT: v_mov_b32_e32 v21, s14 ; VI-NEXT: v_mov_b32_e32 v22, s13 @@ -28713,9 +29140,9 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; VI-NEXT: ; implicit-def: $sgpr59 ; VI-NEXT: ; implicit-def: $sgpr58 ; VI-NEXT: ; implicit-def: $sgpr57 -; VI-NEXT: ; implicit-def: $sgpr56 -; VI-NEXT: ; implicit-def: $sgpr47 -; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 ; VI-NEXT: s_branch .LBB41_2 ; ; GFX9-LABEL: bitcast_v15i64_to_v60i16_scalar: @@ -28724,20 +29151,48 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v17, s16 +; GFX9-NEXT: v_mov_b32_e32 v18, s17 +; GFX9-NEXT: v_mov_b32_e32 v19, s18 +; GFX9-NEXT: v_readfirstlane_b32 s6, v17 +; GFX9-NEXT: v_mov_b32_e32 v17, s19 +; GFX9-NEXT: v_readfirstlane_b32 s7, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, s20 +; GFX9-NEXT: v_readfirstlane_b32 s8, v19 +; GFX9-NEXT: v_mov_b32_e32 v19, s21 +; GFX9-NEXT: v_readfirstlane_b32 s9, v17 +; GFX9-NEXT: v_mov_b32_e32 v17, s22 +; GFX9-NEXT: v_readfirstlane_b32 s10, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, s23 ; GFX9-NEXT: v_writelane_b32 v30, s30, 0 +; GFX9-NEXT: v_readfirstlane_b32 s11, v19 +; GFX9-NEXT: v_mov_b32_e32 v19, s24 +; GFX9-NEXT: v_readfirstlane_b32 s12, v17 +; GFX9-NEXT: v_mov_b32_e32 v17, s25 +; GFX9-NEXT: v_readfirstlane_b32 s13, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, s26 ; GFX9-NEXT: v_writelane_b32 v30, s31, 1 +; GFX9-NEXT: v_readfirstlane_b32 s14, v19 +; GFX9-NEXT: v_mov_b32_e32 v19, s27 +; GFX9-NEXT: v_readfirstlane_b32 s15, v17 +; GFX9-NEXT: v_mov_b32_e32 v17, s28 +; GFX9-NEXT: v_readfirstlane_b32 s16, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; GFX9-NEXT: v_writelane_b32 v30, s34, 2 -; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: v_readfirstlane_b32 s7, v1 -; GFX9-NEXT: v_readfirstlane_b32 s8, v2 -; GFX9-NEXT: v_readfirstlane_b32 s9, v3 -; GFX9-NEXT: v_readfirstlane_b32 s10, v4 -; GFX9-NEXT: v_readfirstlane_b32 s11, v5 -; GFX9-NEXT: v_readfirstlane_b32 s12, v6 -; GFX9-NEXT: v_readfirstlane_b32 s13, v7 -; GFX9-NEXT: v_readfirstlane_b32 s14, v8 -; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s17, v19 +; GFX9-NEXT: v_readfirstlane_b32 s18, v17 +; GFX9-NEXT: v_readfirstlane_b32 s19, v18 +; GFX9-NEXT: v_readfirstlane_b32 s20, v0 +; GFX9-NEXT: v_readfirstlane_b32 s21, v1 +; GFX9-NEXT: v_readfirstlane_b32 s22, v2 +; GFX9-NEXT: v_readfirstlane_b32 s23, v3 +; GFX9-NEXT: v_readfirstlane_b32 s24, v4 +; GFX9-NEXT: v_readfirstlane_b32 s25, v5 +; GFX9-NEXT: v_readfirstlane_b32 s26, v6 +; GFX9-NEXT: v_readfirstlane_b32 s27, v7 +; GFX9-NEXT: v_readfirstlane_b32 s28, v8 +; GFX9-NEXT: v_readfirstlane_b32 s29, v9 ; GFX9-NEXT: v_readfirstlane_b32 s40, v10 ; GFX9-NEXT: v_readfirstlane_b32 s41, v11 ; GFX9-NEXT: v_readfirstlane_b32 s42, v12 @@ -28754,30 +29209,30 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; GFX9-NEXT: s_lshr_b32 s57, s42, 16 ; GFX9-NEXT: s_lshr_b32 s58, s41, 16 ; GFX9-NEXT: s_lshr_b32 s59, s40, 16 -; GFX9-NEXT: s_lshr_b32 s60, s15, 16 -; GFX9-NEXT: s_lshr_b32 s61, s14, 16 -; GFX9-NEXT: s_lshr_b32 s62, s13, 16 -; GFX9-NEXT: s_lshr_b32 s63, s12, 16 -; GFX9-NEXT: s_lshr_b32 s72, s11, 16 -; GFX9-NEXT: s_lshr_b32 s73, s10, 16 -; GFX9-NEXT: s_lshr_b32 s74, s9, 16 -; GFX9-NEXT: s_lshr_b32 s75, s8, 16 -; GFX9-NEXT: s_lshr_b32 s76, s7, 16 -; GFX9-NEXT: s_lshr_b32 s77, s6, 16 -; GFX9-NEXT: s_lshr_b32 s78, s29, 16 -; GFX9-NEXT: s_lshr_b32 s79, s28, 16 -; GFX9-NEXT: s_lshr_b32 s88, s27, 16 -; GFX9-NEXT: s_lshr_b32 s89, s26, 16 -; GFX9-NEXT: s_lshr_b32 s90, s25, 16 -; GFX9-NEXT: s_lshr_b32 s91, s24, 16 -; GFX9-NEXT: s_lshr_b32 s92, s23, 16 -; GFX9-NEXT: s_lshr_b32 s93, s22, 16 -; GFX9-NEXT: s_lshr_b32 s94, s21, 16 -; GFX9-NEXT: s_lshr_b32 s95, s20, 16 -; GFX9-NEXT: s_lshr_b32 s30, s19, 16 -; GFX9-NEXT: s_lshr_b32 s31, s18, 16 -; GFX9-NEXT: s_lshr_b32 s34, s17, 16 -; GFX9-NEXT: s_lshr_b32 s35, s16, 16 +; GFX9-NEXT: s_lshr_b32 s60, s29, 16 +; GFX9-NEXT: s_lshr_b32 s61, s28, 16 +; GFX9-NEXT: s_lshr_b32 s62, s27, 16 +; GFX9-NEXT: s_lshr_b32 s63, s26, 16 +; GFX9-NEXT: s_lshr_b32 s72, s25, 16 +; GFX9-NEXT: s_lshr_b32 s73, s24, 16 +; GFX9-NEXT: s_lshr_b32 s74, s23, 16 +; GFX9-NEXT: s_lshr_b32 s75, s22, 16 +; GFX9-NEXT: s_lshr_b32 s76, s21, 16 +; GFX9-NEXT: s_lshr_b32 s77, s20, 16 +; GFX9-NEXT: s_lshr_b32 s78, s19, 16 +; GFX9-NEXT: s_lshr_b32 s79, s18, 16 +; GFX9-NEXT: s_lshr_b32 s88, s17, 16 +; GFX9-NEXT: s_lshr_b32 s89, s16, 16 +; GFX9-NEXT: s_lshr_b32 s90, s15, 16 +; GFX9-NEXT: s_lshr_b32 s91, s14, 16 +; GFX9-NEXT: s_lshr_b32 s92, s13, 16 +; GFX9-NEXT: s_lshr_b32 s93, s12, 16 +; GFX9-NEXT: s_lshr_b32 s94, s11, 16 +; GFX9-NEXT: s_lshr_b32 s95, s10, 16 +; GFX9-NEXT: s_lshr_b32 s30, s9, 16 +; GFX9-NEXT: s_lshr_b32 s31, s8, 16 +; GFX9-NEXT: s_lshr_b32 s34, s7, 16 +; GFX9-NEXT: s_lshr_b32 s35, s6, 16 ; GFX9-NEXT: s_cbranch_execnz .LBB41_3 ; GFX9-NEXT: .LBB41_2: ; %cmp.true ; GFX9-NEXT: s_add_u32 s44, s44, 3 @@ -28786,16 +29241,6 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; GFX9-NEXT: s_addc_u32 s43, s43, 0 ; GFX9-NEXT: s_add_u32 s40, s40, 3 ; GFX9-NEXT: s_addc_u32 s41, s41, 0 -; GFX9-NEXT: s_add_u32 s14, s14, 3 -; GFX9-NEXT: s_addc_u32 s15, s15, 0 -; GFX9-NEXT: s_add_u32 s12, s12, 3 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-NEXT: s_add_u32 s10, s10, 3 -; GFX9-NEXT: s_addc_u32 s11, s11, 0 -; GFX9-NEXT: s_add_u32 s8, s8, 3 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-NEXT: s_add_u32 s6, s6, 3 -; GFX9-NEXT: s_addc_u32 s7, s7, 0 ; GFX9-NEXT: s_add_u32 s28, s28, 3 ; GFX9-NEXT: s_addc_u32 s29, s29, 0 ; GFX9-NEXT: s_add_u32 s26, s26, 3 @@ -28810,61 +29255,71 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; GFX9-NEXT: s_addc_u32 s19, s19, 0 ; GFX9-NEXT: s_add_u32 s16, s16, 3 ; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_add_u32 s14, s14, 3 +; GFX9-NEXT: s_addc_u32 s15, s15, 0 +; GFX9-NEXT: s_add_u32 s12, s12, 3 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_add_u32 s10, s10, 3 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 ; GFX9-NEXT: s_lshr_b32 s46, s45, 16 ; GFX9-NEXT: s_lshr_b32 s47, s44, 16 ; GFX9-NEXT: s_lshr_b32 s56, s43, 16 ; GFX9-NEXT: s_lshr_b32 s57, s42, 16 ; GFX9-NEXT: s_lshr_b32 s58, s41, 16 ; GFX9-NEXT: s_lshr_b32 s59, s40, 16 -; GFX9-NEXT: s_lshr_b32 s60, s15, 16 -; GFX9-NEXT: s_lshr_b32 s61, s14, 16 -; GFX9-NEXT: s_lshr_b32 s62, s13, 16 -; GFX9-NEXT: s_lshr_b32 s63, s12, 16 -; GFX9-NEXT: s_lshr_b32 s72, s11, 16 -; GFX9-NEXT: s_lshr_b32 s73, s10, 16 -; GFX9-NEXT: s_lshr_b32 s74, s9, 16 -; GFX9-NEXT: s_lshr_b32 s75, s8, 16 -; GFX9-NEXT: s_lshr_b32 s76, s7, 16 -; GFX9-NEXT: s_lshr_b32 s77, s6, 16 -; GFX9-NEXT: s_lshr_b32 s78, s29, 16 -; GFX9-NEXT: s_lshr_b32 s79, s28, 16 -; GFX9-NEXT: s_lshr_b32 s88, s27, 16 -; GFX9-NEXT: s_lshr_b32 s89, s26, 16 -; GFX9-NEXT: s_lshr_b32 s90, s25, 16 -; GFX9-NEXT: s_lshr_b32 s91, s24, 16 -; GFX9-NEXT: s_lshr_b32 s92, s23, 16 -; GFX9-NEXT: s_lshr_b32 s93, s22, 16 -; GFX9-NEXT: s_lshr_b32 s94, s21, 16 -; GFX9-NEXT: s_lshr_b32 s95, s20, 16 -; GFX9-NEXT: s_lshr_b32 s30, s19, 16 -; GFX9-NEXT: s_lshr_b32 s31, s18, 16 -; GFX9-NEXT: s_lshr_b32 s34, s17, 16 -; GFX9-NEXT: s_lshr_b32 s35, s16, 16 +; GFX9-NEXT: s_lshr_b32 s60, s29, 16 +; GFX9-NEXT: s_lshr_b32 s61, s28, 16 +; GFX9-NEXT: s_lshr_b32 s62, s27, 16 +; GFX9-NEXT: s_lshr_b32 s63, s26, 16 +; GFX9-NEXT: s_lshr_b32 s72, s25, 16 +; GFX9-NEXT: s_lshr_b32 s73, s24, 16 +; GFX9-NEXT: s_lshr_b32 s74, s23, 16 +; GFX9-NEXT: s_lshr_b32 s75, s22, 16 +; GFX9-NEXT: s_lshr_b32 s76, s21, 16 +; GFX9-NEXT: s_lshr_b32 s77, s20, 16 +; GFX9-NEXT: s_lshr_b32 s78, s19, 16 +; GFX9-NEXT: s_lshr_b32 s79, s18, 16 +; GFX9-NEXT: s_lshr_b32 s88, s17, 16 +; GFX9-NEXT: s_lshr_b32 s89, s16, 16 +; GFX9-NEXT: s_lshr_b32 s90, s15, 16 +; GFX9-NEXT: s_lshr_b32 s91, s14, 16 +; GFX9-NEXT: s_lshr_b32 s92, s13, 16 +; GFX9-NEXT: s_lshr_b32 s93, s12, 16 +; GFX9-NEXT: s_lshr_b32 s94, s11, 16 +; GFX9-NEXT: s_lshr_b32 s95, s10, 16 +; GFX9-NEXT: s_lshr_b32 s30, s9, 16 +; GFX9-NEXT: s_lshr_b32 s31, s8, 16 +; GFX9-NEXT: s_lshr_b32 s34, s7, 16 +; GFX9-NEXT: s_lshr_b32 s35, s6, 16 ; GFX9-NEXT: .LBB41_3: ; %end -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s35 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s34 -; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s31 -; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s30 -; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s95 -; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s94 -; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s93 -; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s92 -; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s91 -; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s90 -; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s89 -; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s88 -; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s79 -; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s78 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s77 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s76 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s75 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s74 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s73 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s72 -; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s63 -; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s62 -; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s61 -; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s35 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s7, s34 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s31 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s9, s30 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s10, s95 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s11, s94 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s12, s93 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s13, s92 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s14, s91 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s15, s90 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s16, s89 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s17, s88 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s60 ; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s59 ; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s58 ; GFX9-NEXT: s_pack_ll_b32_b16 s40, s42, s57 @@ -28873,28 +29328,28 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; GFX9-NEXT: s_pack_ll_b32_b16 s43, s45, s46 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s18 -; GFX9-NEXT: v_mov_b32_e32 v5, s19 -; GFX9-NEXT: v_mov_b32_e32 v6, s20 -; GFX9-NEXT: v_mov_b32_e32 v7, s21 -; GFX9-NEXT: v_mov_b32_e32 v8, s22 -; GFX9-NEXT: v_mov_b32_e32 v9, s23 -; GFX9-NEXT: v_mov_b32_e32 v10, s24 -; GFX9-NEXT: v_mov_b32_e32 v11, s25 -; GFX9-NEXT: v_mov_b32_e32 v12, s26 -; GFX9-NEXT: v_mov_b32_e32 v13, s27 -; GFX9-NEXT: v_mov_b32_e32 v14, s6 -; GFX9-NEXT: v_mov_b32_e32 v15, s7 -; GFX9-NEXT: v_mov_b32_e32 v16, s8 -; GFX9-NEXT: v_mov_b32_e32 v17, s9 -; GFX9-NEXT: v_mov_b32_e32 v18, s10 -; GFX9-NEXT: v_mov_b32_e32 v19, s11 -; GFX9-NEXT: v_mov_b32_e32 v20, s12 -; GFX9-NEXT: v_mov_b32_e32 v21, s13 -; GFX9-NEXT: v_mov_b32_e32 v22, s14 -; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-NEXT: v_mov_b32_e32 v12, s16 +; GFX9-NEXT: v_mov_b32_e32 v13, s17 +; GFX9-NEXT: v_mov_b32_e32 v14, s18 +; GFX9-NEXT: v_mov_b32_e32 v15, s19 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v18, s22 +; GFX9-NEXT: v_mov_b32_e32 v19, s23 +; GFX9-NEXT: v_mov_b32_e32 v20, s24 +; GFX9-NEXT: v_mov_b32_e32 v21, s25 +; GFX9-NEXT: v_mov_b32_e32 v22, s26 +; GFX9-NEXT: v_mov_b32_e32 v23, s27 ; GFX9-NEXT: v_mov_b32_e32 v24, s28 ; GFX9-NEXT: v_mov_b32_e32 v25, s29 ; GFX9-NEXT: v_mov_b32_e32 v26, s40 @@ -28946,49 +29401,76 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; GFX11-LABEL: bitcast_v15i64_to_v60i16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v13, s0 :: v_dual_mov_b32 v14, s1 +; GFX11-NEXT: v_dual_mov_b32 v15, s2 :: v_dual_mov_b32 v16, s3 +; GFX11-NEXT: v_dual_mov_b32 v17, s16 :: v_dual_mov_b32 v18, s17 +; GFX11-NEXT: v_dual_mov_b32 v19, s18 :: v_dual_mov_b32 v20, s19 +; GFX11-NEXT: v_dual_mov_b32 v21, s20 :: v_dual_mov_b32 v22, s21 +; GFX11-NEXT: v_dual_mov_b32 v23, s22 :: v_dual_mov_b32 v24, s23 +; GFX11-NEXT: v_dual_mov_b32 v25, s24 :: v_dual_mov_b32 v26, s25 +; GFX11-NEXT: v_dual_mov_b32 v27, s26 :: v_dual_mov_b32 v28, s27 +; GFX11-NEXT: v_dual_mov_b32 v29, s28 :: v_dual_mov_b32 v30, s29 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: v_readfirstlane_b32 s8, v4 -; GFX11-NEXT: v_readfirstlane_b32 s9, v5 -; GFX11-NEXT: v_readfirstlane_b32 s10, v6 -; GFX11-NEXT: v_readfirstlane_b32 s11, v7 -; GFX11-NEXT: v_readfirstlane_b32 s12, v8 -; GFX11-NEXT: v_readfirstlane_b32 s13, v9 -; GFX11-NEXT: v_readfirstlane_b32 s15, v10 -; GFX11-NEXT: v_readfirstlane_b32 s14, v11 +; GFX11-NEXT: v_readfirstlane_b32 s0, v13 +; GFX11-NEXT: v_readfirstlane_b32 s1, v14 +; GFX11-NEXT: v_readfirstlane_b32 s2, v15 +; GFX11-NEXT: v_readfirstlane_b32 s3, v16 +; GFX11-NEXT: v_readfirstlane_b32 s4, v17 +; GFX11-NEXT: v_readfirstlane_b32 s5, v18 +; GFX11-NEXT: v_readfirstlane_b32 s6, v19 +; GFX11-NEXT: v_readfirstlane_b32 s7, v20 +; GFX11-NEXT: v_readfirstlane_b32 s8, v21 +; GFX11-NEXT: v_readfirstlane_b32 s9, v22 +; GFX11-NEXT: v_readfirstlane_b32 s10, v23 +; GFX11-NEXT: v_readfirstlane_b32 s11, v24 +; GFX11-NEXT: v_readfirstlane_b32 s12, v25 +; GFX11-NEXT: v_readfirstlane_b32 s13, v26 +; GFX11-NEXT: v_readfirstlane_b32 s14, v27 +; GFX11-NEXT: v_readfirstlane_b32 s15, v28 +; GFX11-NEXT: v_readfirstlane_b32 s16, v29 +; GFX11-NEXT: v_readfirstlane_b32 s17, v30 +; GFX11-NEXT: v_readfirstlane_b32 s18, v0 +; GFX11-NEXT: v_readfirstlane_b32 s19, v1 +; GFX11-NEXT: v_readfirstlane_b32 s20, v2 +; GFX11-NEXT: v_readfirstlane_b32 s21, v3 +; GFX11-NEXT: v_readfirstlane_b32 s22, v4 +; GFX11-NEXT: v_readfirstlane_b32 s23, v5 +; GFX11-NEXT: v_readfirstlane_b32 s24, v6 +; GFX11-NEXT: v_readfirstlane_b32 s25, v7 +; GFX11-NEXT: v_readfirstlane_b32 s26, v8 +; GFX11-NEXT: v_readfirstlane_b32 s27, v9 +; GFX11-NEXT: v_readfirstlane_b32 s29, v10 +; GFX11-NEXT: v_readfirstlane_b32 s28, v11 ; GFX11-NEXT: s_mov_b32 s94, 0 ; GFX11-NEXT: s_and_b32 s40, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB41_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: s_lshr_b32 s40, s14, 16 -; GFX11-NEXT: s_lshr_b32 s41, s15, 16 -; GFX11-NEXT: s_lshr_b32 s42, s13, 16 -; GFX11-NEXT: s_lshr_b32 s43, s12, 16 -; GFX11-NEXT: s_lshr_b32 s44, s11, 16 -; GFX11-NEXT: s_lshr_b32 s45, s10, 16 -; GFX11-NEXT: s_lshr_b32 s46, s9, 16 -; GFX11-NEXT: s_lshr_b32 s47, s8, 16 -; GFX11-NEXT: s_lshr_b32 s56, s7, 16 -; GFX11-NEXT: s_lshr_b32 s57, s6, 16 -; GFX11-NEXT: s_lshr_b32 s58, s5, 16 -; GFX11-NEXT: s_lshr_b32 s59, s4, 16 -; GFX11-NEXT: s_lshr_b32 s60, s29, 16 -; GFX11-NEXT: s_lshr_b32 s61, s28, 16 -; GFX11-NEXT: s_lshr_b32 s62, s27, 16 -; GFX11-NEXT: s_lshr_b32 s63, s26, 16 -; GFX11-NEXT: s_lshr_b32 s72, s25, 16 -; GFX11-NEXT: s_lshr_b32 s73, s24, 16 -; GFX11-NEXT: s_lshr_b32 s74, s23, 16 -; GFX11-NEXT: s_lshr_b32 s75, s22, 16 -; GFX11-NEXT: s_lshr_b32 s76, s21, 16 -; GFX11-NEXT: s_lshr_b32 s77, s20, 16 -; GFX11-NEXT: s_lshr_b32 s78, s19, 16 -; GFX11-NEXT: s_lshr_b32 s79, s18, 16 -; GFX11-NEXT: s_lshr_b32 s88, s17, 16 -; GFX11-NEXT: s_lshr_b32 s89, s16, 16 +; GFX11-NEXT: s_lshr_b32 s40, s28, 16 +; GFX11-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-NEXT: s_lshr_b32 s44, s25, 16 +; GFX11-NEXT: s_lshr_b32 s45, s24, 16 +; GFX11-NEXT: s_lshr_b32 s46, s23, 16 +; GFX11-NEXT: s_lshr_b32 s47, s22, 16 +; GFX11-NEXT: s_lshr_b32 s56, s21, 16 +; GFX11-NEXT: s_lshr_b32 s57, s20, 16 +; GFX11-NEXT: s_lshr_b32 s58, s19, 16 +; GFX11-NEXT: s_lshr_b32 s59, s18, 16 +; GFX11-NEXT: s_lshr_b32 s60, s17, 16 +; GFX11-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-NEXT: s_lshr_b32 s62, s15, 16 +; GFX11-NEXT: s_lshr_b32 s63, s14, 16 +; GFX11-NEXT: s_lshr_b32 s72, s13, 16 +; GFX11-NEXT: s_lshr_b32 s73, s12, 16 +; GFX11-NEXT: s_lshr_b32 s74, s11, 16 +; GFX11-NEXT: s_lshr_b32 s75, s10, 16 +; GFX11-NEXT: s_lshr_b32 s76, s9, 16 +; GFX11-NEXT: s_lshr_b32 s77, s8, 16 +; GFX11-NEXT: s_lshr_b32 s78, s7, 16 +; GFX11-NEXT: s_lshr_b32 s79, s6, 16 +; GFX11-NEXT: s_lshr_b32 s88, s5, 16 +; GFX11-NEXT: s_lshr_b32 s89, s4, 16 ; GFX11-NEXT: s_lshr_b32 s90, s3, 16 ; GFX11-NEXT: s_lshr_b32 s91, s2, 16 ; GFX11-NEXT: s_lshr_b32 s92, s1, 16 @@ -28996,20 +29478,8 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s94 ; GFX11-NEXT: s_cbranch_vccnz .LBB41_3 ; GFX11-NEXT: .LBB41_2: ; %cmp.true -; GFX11-NEXT: s_add_u32 s15, s15, 3 -; GFX11-NEXT: s_addc_u32 s14, s14, 0 -; GFX11-NEXT: s_add_u32 s12, s12, 3 -; GFX11-NEXT: s_addc_u32 s13, s13, 0 -; GFX11-NEXT: s_add_u32 s10, s10, 3 -; GFX11-NEXT: s_addc_u32 s11, s11, 0 -; GFX11-NEXT: s_add_u32 s8, s8, 3 -; GFX11-NEXT: s_addc_u32 s9, s9, 0 -; GFX11-NEXT: s_add_u32 s6, s6, 3 -; GFX11-NEXT: s_addc_u32 s7, s7, 0 -; GFX11-NEXT: s_add_u32 s4, s4, 3 -; GFX11-NEXT: s_addc_u32 s5, s5, 0 -; GFX11-NEXT: s_add_u32 s28, s28, 3 -; GFX11-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-NEXT: s_add_u32 s29, s29, 3 +; GFX11-NEXT: s_addc_u32 s28, s28, 0 ; GFX11-NEXT: s_add_u32 s26, s26, 3 ; GFX11-NEXT: s_addc_u32 s27, s27, 0 ; GFX11-NEXT: s_add_u32 s24, s24, 3 @@ -29022,36 +29492,48 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; GFX11-NEXT: s_addc_u32 s19, s19, 0 ; GFX11-NEXT: s_add_u32 s16, s16, 3 ; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s14, s14, 3 +; GFX11-NEXT: s_addc_u32 s15, s15, 0 +; GFX11-NEXT: s_add_u32 s12, s12, 3 +; GFX11-NEXT: s_addc_u32 s13, s13, 0 +; GFX11-NEXT: s_add_u32 s10, s10, 3 +; GFX11-NEXT: s_addc_u32 s11, s11, 0 +; GFX11-NEXT: s_add_u32 s8, s8, 3 +; GFX11-NEXT: s_addc_u32 s9, s9, 0 +; GFX11-NEXT: s_add_u32 s6, s6, 3 +; GFX11-NEXT: s_addc_u32 s7, s7, 0 +; GFX11-NEXT: s_add_u32 s4, s4, 3 +; GFX11-NEXT: s_addc_u32 s5, s5, 0 ; GFX11-NEXT: s_add_u32 s2, s2, 3 ; GFX11-NEXT: s_addc_u32 s3, s3, 0 ; GFX11-NEXT: s_add_u32 s0, s0, 3 ; GFX11-NEXT: s_addc_u32 s1, s1, 0 -; GFX11-NEXT: s_lshr_b32 s40, s14, 16 -; GFX11-NEXT: s_lshr_b32 s41, s15, 16 -; GFX11-NEXT: s_lshr_b32 s42, s13, 16 -; GFX11-NEXT: s_lshr_b32 s43, s12, 16 -; GFX11-NEXT: s_lshr_b32 s44, s11, 16 -; GFX11-NEXT: s_lshr_b32 s45, s10, 16 -; GFX11-NEXT: s_lshr_b32 s46, s9, 16 -; GFX11-NEXT: s_lshr_b32 s47, s8, 16 -; GFX11-NEXT: s_lshr_b32 s56, s7, 16 -; GFX11-NEXT: s_lshr_b32 s57, s6, 16 -; GFX11-NEXT: s_lshr_b32 s58, s5, 16 -; GFX11-NEXT: s_lshr_b32 s59, s4, 16 -; GFX11-NEXT: s_lshr_b32 s60, s29, 16 -; GFX11-NEXT: s_lshr_b32 s61, s28, 16 -; GFX11-NEXT: s_lshr_b32 s62, s27, 16 -; GFX11-NEXT: s_lshr_b32 s63, s26, 16 -; GFX11-NEXT: s_lshr_b32 s72, s25, 16 -; GFX11-NEXT: s_lshr_b32 s73, s24, 16 -; GFX11-NEXT: s_lshr_b32 s74, s23, 16 -; GFX11-NEXT: s_lshr_b32 s75, s22, 16 -; GFX11-NEXT: s_lshr_b32 s76, s21, 16 -; GFX11-NEXT: s_lshr_b32 s77, s20, 16 -; GFX11-NEXT: s_lshr_b32 s78, s19, 16 -; GFX11-NEXT: s_lshr_b32 s79, s18, 16 -; GFX11-NEXT: s_lshr_b32 s88, s17, 16 -; GFX11-NEXT: s_lshr_b32 s89, s16, 16 +; GFX11-NEXT: s_lshr_b32 s40, s28, 16 +; GFX11-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-NEXT: s_lshr_b32 s44, s25, 16 +; GFX11-NEXT: s_lshr_b32 s45, s24, 16 +; GFX11-NEXT: s_lshr_b32 s46, s23, 16 +; GFX11-NEXT: s_lshr_b32 s47, s22, 16 +; GFX11-NEXT: s_lshr_b32 s56, s21, 16 +; GFX11-NEXT: s_lshr_b32 s57, s20, 16 +; GFX11-NEXT: s_lshr_b32 s58, s19, 16 +; GFX11-NEXT: s_lshr_b32 s59, s18, 16 +; GFX11-NEXT: s_lshr_b32 s60, s17, 16 +; GFX11-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-NEXT: s_lshr_b32 s62, s15, 16 +; GFX11-NEXT: s_lshr_b32 s63, s14, 16 +; GFX11-NEXT: s_lshr_b32 s72, s13, 16 +; GFX11-NEXT: s_lshr_b32 s73, s12, 16 +; GFX11-NEXT: s_lshr_b32 s74, s11, 16 +; GFX11-NEXT: s_lshr_b32 s75, s10, 16 +; GFX11-NEXT: s_lshr_b32 s76, s9, 16 +; GFX11-NEXT: s_lshr_b32 s77, s8, 16 +; GFX11-NEXT: s_lshr_b32 s78, s7, 16 +; GFX11-NEXT: s_lshr_b32 s79, s6, 16 +; GFX11-NEXT: s_lshr_b32 s88, s5, 16 +; GFX11-NEXT: s_lshr_b32 s89, s4, 16 ; GFX11-NEXT: s_lshr_b32 s90, s3, 16 ; GFX11-NEXT: s_lshr_b32 s91, s2, 16 ; GFX11-NEXT: s_lshr_b32 s92, s1, 16 @@ -29062,47 +29544,47 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s92 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s91 ; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s90 -; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s89 -; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s88 -; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s79 -; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s78 -; GFX11-NEXT: s_pack_ll_b32_b16 s20, s20, s77 -; GFX11-NEXT: s_pack_ll_b32_b16 s21, s21, s76 -; GFX11-NEXT: s_pack_ll_b32_b16 s22, s22, s75 -; GFX11-NEXT: s_pack_ll_b32_b16 s23, s23, s74 -; GFX11-NEXT: s_pack_ll_b32_b16 s24, s24, s73 -; GFX11-NEXT: s_pack_ll_b32_b16 s25, s25, s72 -; GFX11-NEXT: s_pack_ll_b32_b16 s26, s26, s63 -; GFX11-NEXT: s_pack_ll_b32_b16 s27, s27, s62 -; GFX11-NEXT: s_pack_ll_b32_b16 s28, s28, s61 -; GFX11-NEXT: s_pack_ll_b32_b16 s29, s29, s60 -; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s59 -; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s58 -; GFX11-NEXT: s_pack_ll_b32_b16 s6, s6, s57 -; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s56 -; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s47 -; GFX11-NEXT: s_pack_ll_b32_b16 s9, s9, s46 -; GFX11-NEXT: s_pack_ll_b32_b16 s10, s10, s45 -; GFX11-NEXT: s_pack_ll_b32_b16 s11, s11, s44 -; GFX11-NEXT: s_pack_ll_b32_b16 s12, s12, s43 -; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s42 -; GFX11-NEXT: s_pack_ll_b32_b16 s15, s15, s41 -; GFX11-NEXT: s_pack_ll_b32_b16 s14, s14, s40 +; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s89 +; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s88 +; GFX11-NEXT: s_pack_ll_b32_b16 s6, s6, s79 +; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s78 +; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s77 +; GFX11-NEXT: s_pack_ll_b32_b16 s9, s9, s76 +; GFX11-NEXT: s_pack_ll_b32_b16 s10, s10, s75 +; GFX11-NEXT: s_pack_ll_b32_b16 s11, s11, s74 +; GFX11-NEXT: s_pack_ll_b32_b16 s12, s12, s73 +; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s72 +; GFX11-NEXT: s_pack_ll_b32_b16 s14, s14, s63 +; GFX11-NEXT: s_pack_ll_b32_b16 s15, s15, s62 +; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s61 +; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s60 +; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s59 +; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s58 +; GFX11-NEXT: s_pack_ll_b32_b16 s20, s20, s57 +; GFX11-NEXT: s_pack_ll_b32_b16 s21, s21, s56 +; GFX11-NEXT: s_pack_ll_b32_b16 s22, s22, s47 +; GFX11-NEXT: s_pack_ll_b32_b16 s23, s23, s46 +; GFX11-NEXT: s_pack_ll_b32_b16 s24, s24, s45 +; GFX11-NEXT: s_pack_ll_b32_b16 s25, s25, s44 +; GFX11-NEXT: s_pack_ll_b32_b16 s26, s26, s43 +; GFX11-NEXT: s_pack_ll_b32_b16 s27, s27, s42 +; GFX11-NEXT: s_pack_ll_b32_b16 s29, s29, s41 +; GFX11-NEXT: s_pack_ll_b32_b16 s28, s28, s40 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 -; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 -; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 -; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 -; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 -; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 -; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 -; GFX11-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 -; GFX11-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 -; GFX11-NEXT: v_dual_mov_b32 v22, s8 :: v_dual_mov_b32 v23, s9 -; GFX11-NEXT: v_dual_mov_b32 v24, s10 :: v_dual_mov_b32 v25, s11 -; GFX11-NEXT: v_dual_mov_b32 v26, s12 :: v_dual_mov_b32 v27, s13 -; GFX11-NEXT: v_dual_mov_b32 v28, s15 :: v_dual_mov_b32 v29, s14 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GFX11-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v21, s21 +; GFX11-NEXT: v_dual_mov_b32 v22, s22 :: v_dual_mov_b32 v23, s23 +; GFX11-NEXT: v_dual_mov_b32 v24, s24 :: v_dual_mov_b32 v25, s25 +; GFX11-NEXT: v_dual_mov_b32 v26, s26 :: v_dual_mov_b32 v27, s27 +; GFX11-NEXT: v_dual_mov_b32 v28, s29 :: v_dual_mov_b32 v29, s28 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB41_4: ; GFX11-NEXT: ; implicit-def: $sgpr93 @@ -29653,7 +30135,7 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 ; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 ; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 -; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v27 ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v28 ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 ; SI-NEXT: .LBB42_4: ; %end @@ -33253,13 +33735,41 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; SI-LABEL: bitcast_v15i64_to_v60f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v18, s16 +; SI-NEXT: v_mov_b32_e32 v19, s17 +; SI-NEXT: v_readfirstlane_b32 s40, v18 +; SI-NEXT: v_mov_b32_e32 v18, s18 +; SI-NEXT: v_readfirstlane_b32 s43, v19 +; SI-NEXT: v_mov_b32_e32 v19, s19 +; SI-NEXT: v_readfirstlane_b32 s41, v18 +; SI-NEXT: v_mov_b32_e32 v18, s20 +; SI-NEXT: v_readfirstlane_b32 s44, v19 +; SI-NEXT: v_mov_b32_e32 v19, s21 +; SI-NEXT: v_readfirstlane_b32 s42, v18 +; SI-NEXT: v_mov_b32_e32 v18, s22 +; SI-NEXT: v_readfirstlane_b32 s45, v19 +; SI-NEXT: v_mov_b32_e32 v19, s23 +; SI-NEXT: v_readfirstlane_b32 s22, v18 +; SI-NEXT: v_mov_b32_e32 v18, s24 +; SI-NEXT: v_readfirstlane_b32 s46, v19 +; SI-NEXT: v_mov_b32_e32 v19, s25 +; SI-NEXT: v_readfirstlane_b32 s23, v18 +; SI-NEXT: v_mov_b32_e32 v18, s26 +; SI-NEXT: v_readfirstlane_b32 s47, v19 +; SI-NEXT: v_mov_b32_e32 v19, s27 +; SI-NEXT: v_readfirstlane_b32 s24, v18 +; SI-NEXT: v_mov_b32_e32 v18, s28 +; SI-NEXT: v_readfirstlane_b32 s27, v19 +; SI-NEXT: v_mov_b32_e32 v19, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; SI-NEXT: v_readfirstlane_b32 s44, v1 -; SI-NEXT: v_readfirstlane_b32 s45, v2 -; SI-NEXT: v_readfirstlane_b32 s42, v3 -; SI-NEXT: v_readfirstlane_b32 s43, v4 -; SI-NEXT: v_readfirstlane_b32 s40, v5 -; SI-NEXT: v_readfirstlane_b32 s41, v6 +; SI-NEXT: v_readfirstlane_b32 s25, v18 +; SI-NEXT: v_readfirstlane_b32 s26, v19 +; SI-NEXT: v_readfirstlane_b32 s20, v1 +; SI-NEXT: v_readfirstlane_b32 s21, v2 +; SI-NEXT: v_readfirstlane_b32 s18, v3 +; SI-NEXT: v_readfirstlane_b32 s19, v4 +; SI-NEXT: v_readfirstlane_b32 s16, v5 +; SI-NEXT: v_readfirstlane_b32 s17, v6 ; SI-NEXT: v_readfirstlane_b32 s14, v7 ; SI-NEXT: v_readfirstlane_b32 s15, v8 ; SI-NEXT: v_readfirstlane_b32 s12, v9 @@ -33306,48 +33816,48 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 ; SI-NEXT: s_lshr_b32 s4, s14, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s41, 16 +; SI-NEXT: s_lshr_b32 s4, s17, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s40, 16 +; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_lshr_b32 s4, s43, 16 +; SI-NEXT: s_lshr_b32 s4, s19, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 -; SI-NEXT: s_lshr_b32 s4, s42, 16 +; SI-NEXT: s_lshr_b32 s4, s18, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 -; SI-NEXT: s_lshr_b32 s4, s45, 16 +; SI-NEXT: s_lshr_b32 s4, s21, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 -; SI-NEXT: s_lshr_b32 s4, s44, 16 +; SI-NEXT: s_lshr_b32 s4, s20, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 -; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s4, s26, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: s_lshr_b32 s4, s25, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 ; SI-NEXT: s_lshr_b32 s4, s27, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: s_lshr_b32 s4, s24, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: s_lshr_b32 s4, s47, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 ; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: s_lshr_b32 s4, s46, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 ; SI-NEXT: s_lshr_b32 s4, s22, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: s_lshr_b32 s4, s45, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: s_lshr_b32 s4, s42, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: s_lshr_b32 s4, s44, 16 ; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: s_lshr_b32 s4, s41, 16 ; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: s_lshr_b32 s4, s43, 16 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_lshr_b32 s4, s40, 16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 @@ -33360,68 +33870,68 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v18, s12 ; SI-NEXT: v_cvt_f32_f16_e32 v20, s15 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s25 ; SI-NEXT: v_cvt_f32_f16_e32 v38, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s46 ; SI-NEXT: v_cvt_f32_f16_e32 v40, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s40 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s16, 3 -; SI-NEXT: s_addc_u32 s5, s17, 0 -; SI-NEXT: s_lshr_b32 s16, s4, 16 -; SI-NEXT: s_lshr_b32 s17, s5, 16 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: s_lshr_b32 s46, s18, 16 -; SI-NEXT: s_lshr_b32 s47, s19, 16 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_lshr_b32 s56, s20, 16 -; SI-NEXT: s_lshr_b32 s57, s21, 16 +; SI-NEXT: s_add_u32 s4, s40, 3 +; SI-NEXT: s_addc_u32 s5, s43, 0 +; SI-NEXT: s_lshr_b32 s28, s4, 16 +; SI-NEXT: s_lshr_b32 s29, s5, 16 +; SI-NEXT: s_add_u32 s40, s41, 3 +; SI-NEXT: s_addc_u32 s41, s44, 0 +; SI-NEXT: s_lshr_b32 s43, s40, 16 +; SI-NEXT: s_lshr_b32 s44, s41, 16 +; SI-NEXT: s_add_u32 s42, s42, 3 +; SI-NEXT: s_addc_u32 s45, s45, 0 +; SI-NEXT: s_lshr_b32 s56, s42, 16 +; SI-NEXT: s_lshr_b32 s57, s45, 16 ; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_addc_u32 s46, s46, 0 ; SI-NEXT: s_lshr_b32 s58, s22, 16 -; SI-NEXT: s_lshr_b32 s59, s23, 16 +; SI-NEXT: s_lshr_b32 s59, s46, 16 +; SI-NEXT: s_add_u32 s23, s23, 3 +; SI-NEXT: s_addc_u32 s47, s47, 0 +; SI-NEXT: s_lshr_b32 s60, s23, 16 +; SI-NEXT: s_lshr_b32 s61, s47, 16 ; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s25, s25, 0 -; SI-NEXT: s_lshr_b32 s60, s24, 16 -; SI-NEXT: s_lshr_b32 s61, s25, 16 -; SI-NEXT: s_add_u32 s26, s26, 3 ; SI-NEXT: s_addc_u32 s27, s27, 0 -; SI-NEXT: s_lshr_b32 s62, s26, 16 +; SI-NEXT: s_lshr_b32 s62, s24, 16 ; SI-NEXT: s_lshr_b32 s63, s27, 16 -; SI-NEXT: s_add_u32 s28, s28, 3 -; SI-NEXT: s_addc_u32 s29, s29, 0 -; SI-NEXT: s_lshr_b32 s72, s28, 16 -; SI-NEXT: s_lshr_b32 s73, s29, 16 -; SI-NEXT: s_add_u32 s44, s44, 3 -; SI-NEXT: s_addc_u32 s45, s45, 0 -; SI-NEXT: s_lshr_b32 s74, s44, 16 -; SI-NEXT: s_lshr_b32 s75, s45, 16 -; SI-NEXT: s_add_u32 s42, s42, 3 -; SI-NEXT: s_addc_u32 s43, s43, 0 -; SI-NEXT: s_lshr_b32 s76, s42, 16 -; SI-NEXT: s_lshr_b32 s77, s43, 16 -; SI-NEXT: s_add_u32 s40, s40, 3 -; SI-NEXT: s_addc_u32 s41, s41, 0 -; SI-NEXT: s_lshr_b32 s78, s40, 16 -; SI-NEXT: s_lshr_b32 s79, s41, 16 +; SI-NEXT: s_add_u32 s25, s25, 3 +; SI-NEXT: s_addc_u32 s26, s26, 0 +; SI-NEXT: s_lshr_b32 s72, s25, 16 +; SI-NEXT: s_lshr_b32 s73, s26, 16 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_lshr_b32 s74, s20, 16 +; SI-NEXT: s_lshr_b32 s75, s21, 16 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_lshr_b32 s76, s18, 16 +; SI-NEXT: s_lshr_b32 s77, s19, 16 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_lshr_b32 s78, s16, 16 +; SI-NEXT: s_lshr_b32 s79, s17, 16 ; SI-NEXT: s_add_u32 s14, s14, 3 ; SI-NEXT: s_addc_u32 s15, s15, 0 ; SI-NEXT: s_lshr_b32 s88, s14, 16 @@ -33452,25 +33962,25 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v18, s12 ; SI-NEXT: v_cvt_f32_f16_e32 v20, s15 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s25 ; SI-NEXT: v_cvt_f32_f16_e32 v38, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s46 ; SI-NEXT: v_cvt_f32_f16_e32 v40, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s41 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v47, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s40 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v57, s5 ; SI-NEXT: s_waitcnt expcnt(1) @@ -33501,11 +34011,11 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v55, s58 ; SI-NEXT: v_cvt_f32_f16_e32 v41, s57 ; SI-NEXT: v_cvt_f32_f16_e32 v43, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s29 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v60, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s28 ; SI-NEXT: .LBB45_3: ; %end ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 @@ -33802,18 +34312,46 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: v_writelane_b32 v30, s30, 0 ; VI-NEXT: v_writelane_b32 v30, s31, 1 +; VI-NEXT: v_mov_b32_e32 v17, s16 +; VI-NEXT: v_mov_b32_e32 v18, s17 ; VI-NEXT: v_writelane_b32 v30, s34, 2 +; VI-NEXT: v_mov_b32_e32 v19, s18 +; VI-NEXT: v_readfirstlane_b32 s56, v17 +; VI-NEXT: v_mov_b32_e32 v17, s19 +; VI-NEXT: v_readfirstlane_b32 s47, v18 +; VI-NEXT: v_mov_b32_e32 v18, s20 ; VI-NEXT: v_writelane_b32 v30, s35, 3 +; VI-NEXT: v_readfirstlane_b32 s46, v19 +; VI-NEXT: v_mov_b32_e32 v19, s21 +; VI-NEXT: v_readfirstlane_b32 s45, v17 +; VI-NEXT: v_mov_b32_e32 v17, s22 +; VI-NEXT: v_readfirstlane_b32 s44, v18 +; VI-NEXT: v_mov_b32_e32 v18, s23 ; VI-NEXT: v_writelane_b32 v30, s36, 4 +; VI-NEXT: v_readfirstlane_b32 s43, v19 +; VI-NEXT: v_mov_b32_e32 v19, s24 +; VI-NEXT: v_readfirstlane_b32 s42, v17 +; VI-NEXT: v_mov_b32_e32 v17, s25 +; VI-NEXT: v_readfirstlane_b32 s41, v18 +; VI-NEXT: v_mov_b32_e32 v18, s26 ; VI-NEXT: v_writelane_b32 v30, s37, 5 +; VI-NEXT: v_readfirstlane_b32 s40, v19 +; VI-NEXT: v_mov_b32_e32 v19, s27 +; VI-NEXT: v_readfirstlane_b32 s26, v17 +; VI-NEXT: v_mov_b32_e32 v17, s28 +; VI-NEXT: v_readfirstlane_b32 s25, v18 +; VI-NEXT: v_mov_b32_e32 v18, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; VI-NEXT: v_writelane_b32 v30, s38, 6 -; VI-NEXT: v_readfirstlane_b32 s45, v0 -; VI-NEXT: v_readfirstlane_b32 s44, v1 -; VI-NEXT: v_readfirstlane_b32 s43, v2 -; VI-NEXT: v_readfirstlane_b32 s42, v3 -; VI-NEXT: v_readfirstlane_b32 s41, v4 -; VI-NEXT: v_readfirstlane_b32 s40, v5 +; VI-NEXT: v_readfirstlane_b32 s24, v19 +; VI-NEXT: v_readfirstlane_b32 s23, v17 +; VI-NEXT: v_readfirstlane_b32 s22, v18 +; VI-NEXT: v_readfirstlane_b32 s21, v0 +; VI-NEXT: v_readfirstlane_b32 s20, v1 +; VI-NEXT: v_readfirstlane_b32 s19, v2 +; VI-NEXT: v_readfirstlane_b32 s18, v3 +; VI-NEXT: v_readfirstlane_b32 s17, v4 +; VI-NEXT: v_readfirstlane_b32 s16, v5 ; VI-NEXT: v_readfirstlane_b32 s15, v6 ; VI-NEXT: v_readfirstlane_b32 s14, v7 ; VI-NEXT: v_readfirstlane_b32 s13, v8 @@ -33828,9 +34366,9 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; VI-NEXT: v_writelane_b32 v30, s39, 7 ; VI-NEXT: s_cbranch_scc0 .LBB45_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s46, s7, 16 -; VI-NEXT: s_lshr_b32 s47, s6, 16 -; VI-NEXT: s_lshr_b32 s56, s8, 16 +; VI-NEXT: s_lshr_b32 s27, s7, 16 +; VI-NEXT: s_lshr_b32 s28, s6, 16 +; VI-NEXT: s_lshr_b32 s29, s8, 16 ; VI-NEXT: s_lshr_b32 s57, s9, 16 ; VI-NEXT: s_lshr_b32 s58, s10, 16 ; VI-NEXT: s_lshr_b32 s59, s11, 16 @@ -33838,26 +34376,26 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; VI-NEXT: s_lshr_b32 s61, s13, 16 ; VI-NEXT: s_lshr_b32 s62, s14, 16 ; VI-NEXT: s_lshr_b32 s63, s15, 16 -; VI-NEXT: s_lshr_b32 s72, s40, 16 -; VI-NEXT: s_lshr_b32 s73, s41, 16 -; VI-NEXT: s_lshr_b32 s74, s42, 16 -; VI-NEXT: s_lshr_b32 s75, s43, 16 -; VI-NEXT: s_lshr_b32 s76, s44, 16 -; VI-NEXT: s_lshr_b32 s77, s45, 16 -; VI-NEXT: s_lshr_b32 s78, s29, 16 -; VI-NEXT: s_lshr_b32 s79, s28, 16 -; VI-NEXT: s_lshr_b32 s88, s27, 16 -; VI-NEXT: s_lshr_b32 s89, s26, 16 -; VI-NEXT: s_lshr_b32 s90, s25, 16 -; VI-NEXT: s_lshr_b32 s91, s24, 16 -; VI-NEXT: s_lshr_b32 s30, s23, 16 -; VI-NEXT: s_lshr_b32 s31, s22, 16 -; VI-NEXT: s_lshr_b32 s34, s21, 16 -; VI-NEXT: s_lshr_b32 s35, s20, 16 -; VI-NEXT: s_lshr_b32 s36, s19, 16 -; VI-NEXT: s_lshr_b32 s37, s18, 16 -; VI-NEXT: s_lshr_b32 s38, s17, 16 -; VI-NEXT: s_lshr_b32 s39, s16, 16 +; VI-NEXT: s_lshr_b32 s72, s16, 16 +; VI-NEXT: s_lshr_b32 s73, s17, 16 +; VI-NEXT: s_lshr_b32 s74, s18, 16 +; VI-NEXT: s_lshr_b32 s75, s19, 16 +; VI-NEXT: s_lshr_b32 s76, s20, 16 +; VI-NEXT: s_lshr_b32 s77, s21, 16 +; VI-NEXT: s_lshr_b32 s78, s22, 16 +; VI-NEXT: s_lshr_b32 s79, s23, 16 +; VI-NEXT: s_lshr_b32 s88, s24, 16 +; VI-NEXT: s_lshr_b32 s89, s25, 16 +; VI-NEXT: s_lshr_b32 s90, s26, 16 +; VI-NEXT: s_lshr_b32 s91, s40, 16 +; VI-NEXT: s_lshr_b32 s30, s41, 16 +; VI-NEXT: s_lshr_b32 s31, s42, 16 +; VI-NEXT: s_lshr_b32 s34, s43, 16 +; VI-NEXT: s_lshr_b32 s35, s44, 16 +; VI-NEXT: s_lshr_b32 s36, s45, 16 +; VI-NEXT: s_lshr_b32 s37, s46, 16 +; VI-NEXT: s_lshr_b32 s38, s47, 16 +; VI-NEXT: s_lshr_b32 s39, s56, 16 ; VI-NEXT: s_cbranch_execnz .LBB45_3 ; VI-NEXT: .LBB45_2: ; %cmp.true ; VI-NEXT: s_add_u32 s6, s6, 3 @@ -33870,29 +34408,29 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; VI-NEXT: s_addc_u32 s12, s12, 0 ; VI-NEXT: s_add_u32 s15, s15, 3 ; VI-NEXT: s_addc_u32 s14, s14, 0 -; VI-NEXT: s_add_u32 s41, s41, 3 -; VI-NEXT: s_addc_u32 s40, s40, 0 -; VI-NEXT: s_add_u32 s43, s43, 3 -; VI-NEXT: s_addc_u32 s42, s42, 0 -; VI-NEXT: s_add_u32 s45, s45, 3 -; VI-NEXT: s_addc_u32 s44, s44, 0 -; VI-NEXT: s_add_u32 s28, s28, 3 -; VI-NEXT: s_addc_u32 s29, s29, 0 -; VI-NEXT: s_add_u32 s26, s26, 3 -; VI-NEXT: s_addc_u32 s27, s27, 0 -; VI-NEXT: s_add_u32 s24, s24, 3 -; VI-NEXT: s_addc_u32 s25, s25, 0 -; VI-NEXT: s_add_u32 s22, s22, 3 -; VI-NEXT: s_addc_u32 s23, s23, 0 -; VI-NEXT: s_add_u32 s20, s20, 3 -; VI-NEXT: s_addc_u32 s21, s21, 0 -; VI-NEXT: s_add_u32 s18, s18, 3 -; VI-NEXT: s_addc_u32 s19, s19, 0 -; VI-NEXT: s_add_u32 s16, s16, 3 -; VI-NEXT: s_addc_u32 s17, s17, 0 -; VI-NEXT: s_lshr_b32 s46, s7, 16 -; VI-NEXT: s_lshr_b32 s47, s6, 16 -; VI-NEXT: s_lshr_b32 s56, s8, 16 +; VI-NEXT: s_add_u32 s17, s17, 3 +; VI-NEXT: s_addc_u32 s16, s16, 0 +; VI-NEXT: s_add_u32 s19, s19, 3 +; VI-NEXT: s_addc_u32 s18, s18, 0 +; VI-NEXT: s_add_u32 s21, s21, 3 +; VI-NEXT: s_addc_u32 s20, s20, 0 +; VI-NEXT: s_add_u32 s23, s23, 3 +; VI-NEXT: s_addc_u32 s22, s22, 0 +; VI-NEXT: s_add_u32 s25, s25, 3 +; VI-NEXT: s_addc_u32 s24, s24, 0 +; VI-NEXT: s_add_u32 s40, s40, 3 +; VI-NEXT: s_addc_u32 s26, s26, 0 +; VI-NEXT: s_add_u32 s42, s42, 3 +; VI-NEXT: s_addc_u32 s41, s41, 0 +; VI-NEXT: s_add_u32 s44, s44, 3 +; VI-NEXT: s_addc_u32 s43, s43, 0 +; VI-NEXT: s_add_u32 s46, s46, 3 +; VI-NEXT: s_addc_u32 s45, s45, 0 +; VI-NEXT: s_add_u32 s56, s56, 3 +; VI-NEXT: s_addc_u32 s47, s47, 0 +; VI-NEXT: s_lshr_b32 s27, s7, 16 +; VI-NEXT: s_lshr_b32 s28, s6, 16 +; VI-NEXT: s_lshr_b32 s29, s8, 16 ; VI-NEXT: s_lshr_b32 s57, s9, 16 ; VI-NEXT: s_lshr_b32 s58, s10, 16 ; VI-NEXT: s_lshr_b32 s59, s11, 16 @@ -33900,137 +34438,137 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; VI-NEXT: s_lshr_b32 s61, s13, 16 ; VI-NEXT: s_lshr_b32 s62, s14, 16 ; VI-NEXT: s_lshr_b32 s63, s15, 16 -; VI-NEXT: s_lshr_b32 s72, s40, 16 -; VI-NEXT: s_lshr_b32 s73, s41, 16 -; VI-NEXT: s_lshr_b32 s74, s42, 16 -; VI-NEXT: s_lshr_b32 s75, s43, 16 -; VI-NEXT: s_lshr_b32 s76, s44, 16 -; VI-NEXT: s_lshr_b32 s77, s45, 16 -; VI-NEXT: s_lshr_b32 s78, s29, 16 -; VI-NEXT: s_lshr_b32 s79, s28, 16 -; VI-NEXT: s_lshr_b32 s88, s27, 16 -; VI-NEXT: s_lshr_b32 s89, s26, 16 -; VI-NEXT: s_lshr_b32 s90, s25, 16 -; VI-NEXT: s_lshr_b32 s91, s24, 16 -; VI-NEXT: s_lshr_b32 s30, s23, 16 -; VI-NEXT: s_lshr_b32 s31, s22, 16 -; VI-NEXT: s_lshr_b32 s34, s21, 16 -; VI-NEXT: s_lshr_b32 s35, s20, 16 -; VI-NEXT: s_lshr_b32 s36, s19, 16 -; VI-NEXT: s_lshr_b32 s37, s18, 16 -; VI-NEXT: s_lshr_b32 s38, s17, 16 -; VI-NEXT: s_lshr_b32 s39, s16, 16 +; VI-NEXT: s_lshr_b32 s72, s16, 16 +; VI-NEXT: s_lshr_b32 s73, s17, 16 +; VI-NEXT: s_lshr_b32 s74, s18, 16 +; VI-NEXT: s_lshr_b32 s75, s19, 16 +; VI-NEXT: s_lshr_b32 s76, s20, 16 +; VI-NEXT: s_lshr_b32 s77, s21, 16 +; VI-NEXT: s_lshr_b32 s78, s22, 16 +; VI-NEXT: s_lshr_b32 s79, s23, 16 +; VI-NEXT: s_lshr_b32 s88, s24, 16 +; VI-NEXT: s_lshr_b32 s89, s25, 16 +; VI-NEXT: s_lshr_b32 s90, s26, 16 +; VI-NEXT: s_lshr_b32 s91, s40, 16 +; VI-NEXT: s_lshr_b32 s30, s41, 16 +; VI-NEXT: s_lshr_b32 s31, s42, 16 +; VI-NEXT: s_lshr_b32 s34, s43, 16 +; VI-NEXT: s_lshr_b32 s35, s44, 16 +; VI-NEXT: s_lshr_b32 s36, s45, 16 +; VI-NEXT: s_lshr_b32 s37, s46, 16 +; VI-NEXT: s_lshr_b32 s38, s47, 16 +; VI-NEXT: s_lshr_b32 s39, s56, 16 ; VI-NEXT: .LBB45_3: ; %end -; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_and_b32 s4, 0xffff, s56 ; VI-NEXT: s_lshl_b32 s5, s39, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, 0xffff, s17 -; VI-NEXT: s_lshl_b32 s16, s38, 16 -; VI-NEXT: s_or_b32 s5, s5, s16 -; VI-NEXT: s_and_b32 s16, 0xffff, s18 -; VI-NEXT: s_lshl_b32 s17, s37, 16 -; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: s_and_b32 s17, 0xffff, s19 -; VI-NEXT: s_lshl_b32 s18, s36, 16 -; VI-NEXT: s_or_b32 s17, s17, s18 -; VI-NEXT: s_and_b32 s18, 0xffff, s20 -; VI-NEXT: s_lshl_b32 s19, s35, 16 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_and_b32 s19, 0xffff, s21 -; VI-NEXT: s_lshl_b32 s20, s34, 16 -; VI-NEXT: s_or_b32 s19, s19, s20 -; VI-NEXT: s_and_b32 s20, 0xffff, s22 -; VI-NEXT: s_lshl_b32 s21, s31, 16 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: s_and_b32 s21, 0xffff, s23 -; VI-NEXT: s_lshl_b32 s22, s30, 16 -; VI-NEXT: s_or_b32 s21, s21, s22 -; VI-NEXT: s_and_b32 s22, 0xffff, s24 -; VI-NEXT: s_lshl_b32 s23, s91, 16 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: s_and_b32 s23, 0xffff, s25 -; VI-NEXT: s_lshl_b32 s24, s90, 16 -; VI-NEXT: s_or_b32 s23, s23, s24 -; VI-NEXT: s_and_b32 s24, 0xffff, s26 -; VI-NEXT: s_lshl_b32 s25, s89, 16 -; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: s_and_b32 s25, 0xffff, s27 -; VI-NEXT: s_lshl_b32 s26, s88, 16 -; VI-NEXT: s_or_b32 s25, s25, s26 -; VI-NEXT: s_and_b32 s26, 0xffff, s28 -; VI-NEXT: s_lshl_b32 s27, s79, 16 -; VI-NEXT: s_or_b32 s26, s26, s27 -; VI-NEXT: s_and_b32 s27, 0xffff, s29 -; VI-NEXT: s_lshl_b32 s28, s78, 16 -; VI-NEXT: s_or_b32 s27, s27, s28 -; VI-NEXT: s_and_b32 s28, 0xffff, s45 -; VI-NEXT: s_lshl_b32 s29, s77, 16 -; VI-NEXT: s_or_b32 s28, s28, s29 -; VI-NEXT: s_and_b32 s29, 0xffff, s44 -; VI-NEXT: s_lshl_b32 s44, s76, 16 -; VI-NEXT: s_or_b32 s29, s29, s44 +; VI-NEXT: s_and_b32 s5, 0xffff, s47 +; VI-NEXT: s_lshl_b32 s47, s38, 16 +; VI-NEXT: s_or_b32 s5, s5, s47 +; VI-NEXT: s_and_b32 s46, 0xffff, s46 +; VI-NEXT: s_lshl_b32 s47, s37, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s45, 0xffff, s45 +; VI-NEXT: s_lshl_b32 s47, s36, 16 +; VI-NEXT: s_or_b32 s45, s45, s47 +; VI-NEXT: s_and_b32 s44, 0xffff, s44 +; VI-NEXT: s_lshl_b32 s47, s35, 16 +; VI-NEXT: s_or_b32 s44, s44, s47 ; VI-NEXT: s_and_b32 s43, 0xffff, s43 -; VI-NEXT: s_lshl_b32 s44, s75, 16 -; VI-NEXT: s_or_b32 s43, s43, s44 +; VI-NEXT: s_lshl_b32 s47, s34, 16 +; VI-NEXT: s_or_b32 s43, s43, s47 ; VI-NEXT: s_and_b32 s42, 0xffff, s42 -; VI-NEXT: s_lshl_b32 s44, s74, 16 -; VI-NEXT: s_or_b32 s42, s42, s44 +; VI-NEXT: s_lshl_b32 s47, s31, 16 +; VI-NEXT: s_or_b32 s42, s42, s47 ; VI-NEXT: s_and_b32 s41, 0xffff, s41 -; VI-NEXT: s_lshl_b32 s44, s73, 16 -; VI-NEXT: s_or_b32 s41, s41, s44 +; VI-NEXT: s_lshl_b32 s47, s30, 16 +; VI-NEXT: s_or_b32 s41, s41, s47 ; VI-NEXT: s_and_b32 s40, 0xffff, s40 -; VI-NEXT: s_lshl_b32 s44, s72, 16 -; VI-NEXT: s_or_b32 s40, s40, s44 +; VI-NEXT: s_lshl_b32 s47, s91, 16 +; VI-NEXT: s_or_b32 s40, s40, s47 +; VI-NEXT: s_and_b32 s26, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s47, s90, 16 +; VI-NEXT: s_or_b32 s26, s26, s47 +; VI-NEXT: s_and_b32 s25, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s47, s89, 16 +; VI-NEXT: s_or_b32 s25, s25, s47 +; VI-NEXT: s_and_b32 s24, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s47, s88, 16 +; VI-NEXT: s_or_b32 s24, s24, s47 +; VI-NEXT: s_and_b32 s23, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s47, s79, 16 +; VI-NEXT: s_or_b32 s23, s23, s47 +; VI-NEXT: s_and_b32 s22, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s47, s78, 16 +; VI-NEXT: s_or_b32 s22, s22, s47 +; VI-NEXT: s_and_b32 s21, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s47, s77, 16 +; VI-NEXT: s_or_b32 s21, s21, s47 +; VI-NEXT: s_and_b32 s20, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s76, 16 +; VI-NEXT: s_or_b32 s20, s20, s47 +; VI-NEXT: s_and_b32 s19, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s47, s75, 16 +; VI-NEXT: s_or_b32 s19, s19, s47 +; VI-NEXT: s_and_b32 s18, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s47, s74, 16 +; VI-NEXT: s_or_b32 s18, s18, s47 +; VI-NEXT: s_and_b32 s17, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s47, s73, 16 +; VI-NEXT: s_or_b32 s17, s17, s47 +; VI-NEXT: s_and_b32 s16, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s47, s72, 16 +; VI-NEXT: s_or_b32 s16, s16, s47 ; VI-NEXT: s_and_b32 s15, 0xffff, s15 -; VI-NEXT: s_lshl_b32 s44, s63, 16 -; VI-NEXT: s_or_b32 s15, s15, s44 +; VI-NEXT: s_lshl_b32 s47, s63, 16 +; VI-NEXT: s_or_b32 s15, s15, s47 ; VI-NEXT: s_and_b32 s14, 0xffff, s14 -; VI-NEXT: s_lshl_b32 s44, s62, 16 -; VI-NEXT: s_or_b32 s14, s14, s44 +; VI-NEXT: s_lshl_b32 s47, s62, 16 +; VI-NEXT: s_or_b32 s14, s14, s47 ; VI-NEXT: s_and_b32 s13, 0xffff, s13 -; VI-NEXT: s_lshl_b32 s44, s61, 16 -; VI-NEXT: s_or_b32 s13, s13, s44 +; VI-NEXT: s_lshl_b32 s47, s61, 16 +; VI-NEXT: s_or_b32 s13, s13, s47 ; VI-NEXT: s_and_b32 s12, 0xffff, s12 -; VI-NEXT: s_lshl_b32 s44, s60, 16 -; VI-NEXT: s_or_b32 s12, s12, s44 +; VI-NEXT: s_lshl_b32 s47, s60, 16 +; VI-NEXT: s_or_b32 s12, s12, s47 ; VI-NEXT: s_and_b32 s11, 0xffff, s11 -; VI-NEXT: s_lshl_b32 s44, s59, 16 -; VI-NEXT: s_or_b32 s11, s11, s44 +; VI-NEXT: s_lshl_b32 s47, s59, 16 +; VI-NEXT: s_or_b32 s11, s11, s47 ; VI-NEXT: s_and_b32 s10, 0xffff, s10 -; VI-NEXT: s_lshl_b32 s44, s58, 16 -; VI-NEXT: s_or_b32 s10, s10, s44 +; VI-NEXT: s_lshl_b32 s47, s58, 16 +; VI-NEXT: s_or_b32 s10, s10, s47 ; VI-NEXT: s_and_b32 s9, 0xffff, s9 -; VI-NEXT: s_lshl_b32 s44, s57, 16 -; VI-NEXT: s_or_b32 s9, s9, s44 +; VI-NEXT: s_lshl_b32 s47, s57, 16 ; VI-NEXT: s_and_b32 s8, 0xffff, s8 -; VI-NEXT: s_lshl_b32 s44, s56, 16 -; VI-NEXT: s_or_b32 s8, s8, s44 +; VI-NEXT: s_lshl_b32 s29, s29, 16 ; VI-NEXT: s_and_b32 s6, 0xffff, s6 -; VI-NEXT: s_lshl_b32 s44, s47, 16 -; VI-NEXT: s_or_b32 s6, s6, s44 +; VI-NEXT: s_lshl_b32 s28, s28, 16 ; VI-NEXT: s_and_b32 s7, 0xffff, s7 -; VI-NEXT: s_lshl_b32 s44, s46, 16 -; VI-NEXT: s_or_b32 s7, s7, s44 +; VI-NEXT: s_lshl_b32 s27, s27, 16 +; VI-NEXT: s_or_b32 s9, s9, s47 +; VI-NEXT: s_or_b32 s8, s8, s29 +; VI-NEXT: s_or_b32 s6, s6, s28 +; VI-NEXT: s_or_b32 s7, s7, s27 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: v_mov_b32_e32 v3, s17 -; VI-NEXT: v_mov_b32_e32 v4, s18 -; VI-NEXT: v_mov_b32_e32 v5, s19 -; VI-NEXT: v_mov_b32_e32 v6, s20 -; VI-NEXT: v_mov_b32_e32 v7, s21 -; VI-NEXT: v_mov_b32_e32 v8, s22 -; VI-NEXT: v_mov_b32_e32 v9, s23 -; VI-NEXT: v_mov_b32_e32 v10, s24 -; VI-NEXT: v_mov_b32_e32 v11, s25 -; VI-NEXT: v_mov_b32_e32 v12, s26 -; VI-NEXT: v_mov_b32_e32 v13, s27 -; VI-NEXT: v_mov_b32_e32 v14, s28 -; VI-NEXT: v_mov_b32_e32 v15, s29 -; VI-NEXT: v_mov_b32_e32 v16, s43 -; VI-NEXT: v_mov_b32_e32 v17, s42 -; VI-NEXT: v_mov_b32_e32 v18, s41 -; VI-NEXT: v_mov_b32_e32 v19, s40 +; VI-NEXT: v_mov_b32_e32 v2, s46 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s44 +; VI-NEXT: v_mov_b32_e32 v5, s43 +; VI-NEXT: v_mov_b32_e32 v6, s42 +; VI-NEXT: v_mov_b32_e32 v7, s41 +; VI-NEXT: v_mov_b32_e32 v8, s40 +; VI-NEXT: v_mov_b32_e32 v9, s26 +; VI-NEXT: v_mov_b32_e32 v10, s25 +; VI-NEXT: v_mov_b32_e32 v11, s24 +; VI-NEXT: v_mov_b32_e32 v12, s23 +; VI-NEXT: v_mov_b32_e32 v13, s22 +; VI-NEXT: v_mov_b32_e32 v14, s21 +; VI-NEXT: v_mov_b32_e32 v15, s20 +; VI-NEXT: v_mov_b32_e32 v16, s19 +; VI-NEXT: v_mov_b32_e32 v17, s18 +; VI-NEXT: v_mov_b32_e32 v18, s17 +; VI-NEXT: v_mov_b32_e32 v19, s16 ; VI-NEXT: v_mov_b32_e32 v20, s15 ; VI-NEXT: v_mov_b32_e32 v21, s14 ; VI-NEXT: v_mov_b32_e32 v22, s13 @@ -34082,9 +34620,9 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; VI-NEXT: ; implicit-def: $sgpr59 ; VI-NEXT: ; implicit-def: $sgpr58 ; VI-NEXT: ; implicit-def: $sgpr57 -; VI-NEXT: ; implicit-def: $sgpr56 -; VI-NEXT: ; implicit-def: $sgpr47 -; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 ; VI-NEXT: s_branch .LBB45_2 ; ; GFX9-LABEL: bitcast_v15i64_to_v60f16_scalar: @@ -34093,20 +34631,48 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v17, s16 +; GFX9-NEXT: v_mov_b32_e32 v18, s17 +; GFX9-NEXT: v_mov_b32_e32 v19, s18 +; GFX9-NEXT: v_readfirstlane_b32 s6, v17 +; GFX9-NEXT: v_mov_b32_e32 v17, s19 +; GFX9-NEXT: v_readfirstlane_b32 s7, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, s20 +; GFX9-NEXT: v_readfirstlane_b32 s8, v19 +; GFX9-NEXT: v_mov_b32_e32 v19, s21 +; GFX9-NEXT: v_readfirstlane_b32 s9, v17 +; GFX9-NEXT: v_mov_b32_e32 v17, s22 +; GFX9-NEXT: v_readfirstlane_b32 s10, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, s23 ; GFX9-NEXT: v_writelane_b32 v30, s30, 0 +; GFX9-NEXT: v_readfirstlane_b32 s11, v19 +; GFX9-NEXT: v_mov_b32_e32 v19, s24 +; GFX9-NEXT: v_readfirstlane_b32 s12, v17 +; GFX9-NEXT: v_mov_b32_e32 v17, s25 +; GFX9-NEXT: v_readfirstlane_b32 s13, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, s26 ; GFX9-NEXT: v_writelane_b32 v30, s31, 1 +; GFX9-NEXT: v_readfirstlane_b32 s14, v19 +; GFX9-NEXT: v_mov_b32_e32 v19, s27 +; GFX9-NEXT: v_readfirstlane_b32 s15, v17 +; GFX9-NEXT: v_mov_b32_e32 v17, s28 +; GFX9-NEXT: v_readfirstlane_b32 s16, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; GFX9-NEXT: v_writelane_b32 v30, s34, 2 -; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: v_readfirstlane_b32 s7, v1 -; GFX9-NEXT: v_readfirstlane_b32 s8, v2 -; GFX9-NEXT: v_readfirstlane_b32 s9, v3 -; GFX9-NEXT: v_readfirstlane_b32 s10, v4 -; GFX9-NEXT: v_readfirstlane_b32 s11, v5 -; GFX9-NEXT: v_readfirstlane_b32 s12, v6 -; GFX9-NEXT: v_readfirstlane_b32 s13, v7 -; GFX9-NEXT: v_readfirstlane_b32 s14, v8 -; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s17, v19 +; GFX9-NEXT: v_readfirstlane_b32 s18, v17 +; GFX9-NEXT: v_readfirstlane_b32 s19, v18 +; GFX9-NEXT: v_readfirstlane_b32 s20, v0 +; GFX9-NEXT: v_readfirstlane_b32 s21, v1 +; GFX9-NEXT: v_readfirstlane_b32 s22, v2 +; GFX9-NEXT: v_readfirstlane_b32 s23, v3 +; GFX9-NEXT: v_readfirstlane_b32 s24, v4 +; GFX9-NEXT: v_readfirstlane_b32 s25, v5 +; GFX9-NEXT: v_readfirstlane_b32 s26, v6 +; GFX9-NEXT: v_readfirstlane_b32 s27, v7 +; GFX9-NEXT: v_readfirstlane_b32 s28, v8 +; GFX9-NEXT: v_readfirstlane_b32 s29, v9 ; GFX9-NEXT: v_readfirstlane_b32 s40, v10 ; GFX9-NEXT: v_readfirstlane_b32 s41, v11 ; GFX9-NEXT: v_readfirstlane_b32 s42, v12 @@ -34123,30 +34689,30 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; GFX9-NEXT: s_lshr_b32 s57, s42, 16 ; GFX9-NEXT: s_lshr_b32 s58, s41, 16 ; GFX9-NEXT: s_lshr_b32 s59, s40, 16 -; GFX9-NEXT: s_lshr_b32 s60, s15, 16 -; GFX9-NEXT: s_lshr_b32 s61, s14, 16 -; GFX9-NEXT: s_lshr_b32 s62, s13, 16 -; GFX9-NEXT: s_lshr_b32 s63, s12, 16 -; GFX9-NEXT: s_lshr_b32 s72, s11, 16 -; GFX9-NEXT: s_lshr_b32 s73, s10, 16 -; GFX9-NEXT: s_lshr_b32 s74, s9, 16 -; GFX9-NEXT: s_lshr_b32 s75, s8, 16 -; GFX9-NEXT: s_lshr_b32 s76, s7, 16 -; GFX9-NEXT: s_lshr_b32 s77, s6, 16 -; GFX9-NEXT: s_lshr_b32 s78, s29, 16 -; GFX9-NEXT: s_lshr_b32 s79, s28, 16 -; GFX9-NEXT: s_lshr_b32 s88, s27, 16 -; GFX9-NEXT: s_lshr_b32 s89, s26, 16 -; GFX9-NEXT: s_lshr_b32 s90, s25, 16 -; GFX9-NEXT: s_lshr_b32 s91, s24, 16 -; GFX9-NEXT: s_lshr_b32 s92, s23, 16 -; GFX9-NEXT: s_lshr_b32 s93, s22, 16 -; GFX9-NEXT: s_lshr_b32 s94, s21, 16 -; GFX9-NEXT: s_lshr_b32 s95, s20, 16 -; GFX9-NEXT: s_lshr_b32 s30, s19, 16 -; GFX9-NEXT: s_lshr_b32 s31, s18, 16 -; GFX9-NEXT: s_lshr_b32 s34, s17, 16 -; GFX9-NEXT: s_lshr_b32 s35, s16, 16 +; GFX9-NEXT: s_lshr_b32 s60, s29, 16 +; GFX9-NEXT: s_lshr_b32 s61, s28, 16 +; GFX9-NEXT: s_lshr_b32 s62, s27, 16 +; GFX9-NEXT: s_lshr_b32 s63, s26, 16 +; GFX9-NEXT: s_lshr_b32 s72, s25, 16 +; GFX9-NEXT: s_lshr_b32 s73, s24, 16 +; GFX9-NEXT: s_lshr_b32 s74, s23, 16 +; GFX9-NEXT: s_lshr_b32 s75, s22, 16 +; GFX9-NEXT: s_lshr_b32 s76, s21, 16 +; GFX9-NEXT: s_lshr_b32 s77, s20, 16 +; GFX9-NEXT: s_lshr_b32 s78, s19, 16 +; GFX9-NEXT: s_lshr_b32 s79, s18, 16 +; GFX9-NEXT: s_lshr_b32 s88, s17, 16 +; GFX9-NEXT: s_lshr_b32 s89, s16, 16 +; GFX9-NEXT: s_lshr_b32 s90, s15, 16 +; GFX9-NEXT: s_lshr_b32 s91, s14, 16 +; GFX9-NEXT: s_lshr_b32 s92, s13, 16 +; GFX9-NEXT: s_lshr_b32 s93, s12, 16 +; GFX9-NEXT: s_lshr_b32 s94, s11, 16 +; GFX9-NEXT: s_lshr_b32 s95, s10, 16 +; GFX9-NEXT: s_lshr_b32 s30, s9, 16 +; GFX9-NEXT: s_lshr_b32 s31, s8, 16 +; GFX9-NEXT: s_lshr_b32 s34, s7, 16 +; GFX9-NEXT: s_lshr_b32 s35, s6, 16 ; GFX9-NEXT: s_cbranch_execnz .LBB45_3 ; GFX9-NEXT: .LBB45_2: ; %cmp.true ; GFX9-NEXT: s_add_u32 s44, s44, 3 @@ -34155,16 +34721,6 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; GFX9-NEXT: s_addc_u32 s43, s43, 0 ; GFX9-NEXT: s_add_u32 s40, s40, 3 ; GFX9-NEXT: s_addc_u32 s41, s41, 0 -; GFX9-NEXT: s_add_u32 s14, s14, 3 -; GFX9-NEXT: s_addc_u32 s15, s15, 0 -; GFX9-NEXT: s_add_u32 s12, s12, 3 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-NEXT: s_add_u32 s10, s10, 3 -; GFX9-NEXT: s_addc_u32 s11, s11, 0 -; GFX9-NEXT: s_add_u32 s8, s8, 3 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-NEXT: s_add_u32 s6, s6, 3 -; GFX9-NEXT: s_addc_u32 s7, s7, 0 ; GFX9-NEXT: s_add_u32 s28, s28, 3 ; GFX9-NEXT: s_addc_u32 s29, s29, 0 ; GFX9-NEXT: s_add_u32 s26, s26, 3 @@ -34179,61 +34735,71 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; GFX9-NEXT: s_addc_u32 s19, s19, 0 ; GFX9-NEXT: s_add_u32 s16, s16, 3 ; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_add_u32 s14, s14, 3 +; GFX9-NEXT: s_addc_u32 s15, s15, 0 +; GFX9-NEXT: s_add_u32 s12, s12, 3 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_add_u32 s10, s10, 3 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 ; GFX9-NEXT: s_lshr_b32 s46, s45, 16 ; GFX9-NEXT: s_lshr_b32 s47, s44, 16 ; GFX9-NEXT: s_lshr_b32 s56, s43, 16 ; GFX9-NEXT: s_lshr_b32 s57, s42, 16 ; GFX9-NEXT: s_lshr_b32 s58, s41, 16 ; GFX9-NEXT: s_lshr_b32 s59, s40, 16 -; GFX9-NEXT: s_lshr_b32 s60, s15, 16 -; GFX9-NEXT: s_lshr_b32 s61, s14, 16 -; GFX9-NEXT: s_lshr_b32 s62, s13, 16 -; GFX9-NEXT: s_lshr_b32 s63, s12, 16 -; GFX9-NEXT: s_lshr_b32 s72, s11, 16 -; GFX9-NEXT: s_lshr_b32 s73, s10, 16 -; GFX9-NEXT: s_lshr_b32 s74, s9, 16 -; GFX9-NEXT: s_lshr_b32 s75, s8, 16 -; GFX9-NEXT: s_lshr_b32 s76, s7, 16 -; GFX9-NEXT: s_lshr_b32 s77, s6, 16 -; GFX9-NEXT: s_lshr_b32 s78, s29, 16 -; GFX9-NEXT: s_lshr_b32 s79, s28, 16 -; GFX9-NEXT: s_lshr_b32 s88, s27, 16 -; GFX9-NEXT: s_lshr_b32 s89, s26, 16 -; GFX9-NEXT: s_lshr_b32 s90, s25, 16 -; GFX9-NEXT: s_lshr_b32 s91, s24, 16 -; GFX9-NEXT: s_lshr_b32 s92, s23, 16 -; GFX9-NEXT: s_lshr_b32 s93, s22, 16 -; GFX9-NEXT: s_lshr_b32 s94, s21, 16 -; GFX9-NEXT: s_lshr_b32 s95, s20, 16 -; GFX9-NEXT: s_lshr_b32 s30, s19, 16 -; GFX9-NEXT: s_lshr_b32 s31, s18, 16 -; GFX9-NEXT: s_lshr_b32 s34, s17, 16 -; GFX9-NEXT: s_lshr_b32 s35, s16, 16 +; GFX9-NEXT: s_lshr_b32 s60, s29, 16 +; GFX9-NEXT: s_lshr_b32 s61, s28, 16 +; GFX9-NEXT: s_lshr_b32 s62, s27, 16 +; GFX9-NEXT: s_lshr_b32 s63, s26, 16 +; GFX9-NEXT: s_lshr_b32 s72, s25, 16 +; GFX9-NEXT: s_lshr_b32 s73, s24, 16 +; GFX9-NEXT: s_lshr_b32 s74, s23, 16 +; GFX9-NEXT: s_lshr_b32 s75, s22, 16 +; GFX9-NEXT: s_lshr_b32 s76, s21, 16 +; GFX9-NEXT: s_lshr_b32 s77, s20, 16 +; GFX9-NEXT: s_lshr_b32 s78, s19, 16 +; GFX9-NEXT: s_lshr_b32 s79, s18, 16 +; GFX9-NEXT: s_lshr_b32 s88, s17, 16 +; GFX9-NEXT: s_lshr_b32 s89, s16, 16 +; GFX9-NEXT: s_lshr_b32 s90, s15, 16 +; GFX9-NEXT: s_lshr_b32 s91, s14, 16 +; GFX9-NEXT: s_lshr_b32 s92, s13, 16 +; GFX9-NEXT: s_lshr_b32 s93, s12, 16 +; GFX9-NEXT: s_lshr_b32 s94, s11, 16 +; GFX9-NEXT: s_lshr_b32 s95, s10, 16 +; GFX9-NEXT: s_lshr_b32 s30, s9, 16 +; GFX9-NEXT: s_lshr_b32 s31, s8, 16 +; GFX9-NEXT: s_lshr_b32 s34, s7, 16 +; GFX9-NEXT: s_lshr_b32 s35, s6, 16 ; GFX9-NEXT: .LBB45_3: ; %end -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s35 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s34 -; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s31 -; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s30 -; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s95 -; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s94 -; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s93 -; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s92 -; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s91 -; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s90 -; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s89 -; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s88 -; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s79 -; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s78 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s77 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s76 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s75 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s74 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s73 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s72 -; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s63 -; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s62 -; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s61 -; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s35 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s7, s34 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s31 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s9, s30 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s10, s95 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s11, s94 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s12, s93 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s13, s92 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s14, s91 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s15, s90 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s16, s89 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s17, s88 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s60 ; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s59 ; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s58 ; GFX9-NEXT: s_pack_ll_b32_b16 s40, s42, s57 @@ -34242,28 +34808,28 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; GFX9-NEXT: s_pack_ll_b32_b16 s43, s45, s46 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s18 -; GFX9-NEXT: v_mov_b32_e32 v5, s19 -; GFX9-NEXT: v_mov_b32_e32 v6, s20 -; GFX9-NEXT: v_mov_b32_e32 v7, s21 -; GFX9-NEXT: v_mov_b32_e32 v8, s22 -; GFX9-NEXT: v_mov_b32_e32 v9, s23 -; GFX9-NEXT: v_mov_b32_e32 v10, s24 -; GFX9-NEXT: v_mov_b32_e32 v11, s25 -; GFX9-NEXT: v_mov_b32_e32 v12, s26 -; GFX9-NEXT: v_mov_b32_e32 v13, s27 -; GFX9-NEXT: v_mov_b32_e32 v14, s6 -; GFX9-NEXT: v_mov_b32_e32 v15, s7 -; GFX9-NEXT: v_mov_b32_e32 v16, s8 -; GFX9-NEXT: v_mov_b32_e32 v17, s9 -; GFX9-NEXT: v_mov_b32_e32 v18, s10 -; GFX9-NEXT: v_mov_b32_e32 v19, s11 -; GFX9-NEXT: v_mov_b32_e32 v20, s12 -; GFX9-NEXT: v_mov_b32_e32 v21, s13 -; GFX9-NEXT: v_mov_b32_e32 v22, s14 -; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-NEXT: v_mov_b32_e32 v12, s16 +; GFX9-NEXT: v_mov_b32_e32 v13, s17 +; GFX9-NEXT: v_mov_b32_e32 v14, s18 +; GFX9-NEXT: v_mov_b32_e32 v15, s19 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v18, s22 +; GFX9-NEXT: v_mov_b32_e32 v19, s23 +; GFX9-NEXT: v_mov_b32_e32 v20, s24 +; GFX9-NEXT: v_mov_b32_e32 v21, s25 +; GFX9-NEXT: v_mov_b32_e32 v22, s26 +; GFX9-NEXT: v_mov_b32_e32 v23, s27 ; GFX9-NEXT: v_mov_b32_e32 v24, s28 ; GFX9-NEXT: v_mov_b32_e32 v25, s29 ; GFX9-NEXT: v_mov_b32_e32 v26, s40 @@ -34315,49 +34881,76 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; GFX11-LABEL: bitcast_v15i64_to_v60f16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v13, s0 :: v_dual_mov_b32 v14, s1 +; GFX11-NEXT: v_dual_mov_b32 v15, s2 :: v_dual_mov_b32 v16, s3 +; GFX11-NEXT: v_dual_mov_b32 v17, s16 :: v_dual_mov_b32 v18, s17 +; GFX11-NEXT: v_dual_mov_b32 v19, s18 :: v_dual_mov_b32 v20, s19 +; GFX11-NEXT: v_dual_mov_b32 v21, s20 :: v_dual_mov_b32 v22, s21 +; GFX11-NEXT: v_dual_mov_b32 v23, s22 :: v_dual_mov_b32 v24, s23 +; GFX11-NEXT: v_dual_mov_b32 v25, s24 :: v_dual_mov_b32 v26, s25 +; GFX11-NEXT: v_dual_mov_b32 v27, s26 :: v_dual_mov_b32 v28, s27 +; GFX11-NEXT: v_dual_mov_b32 v29, s28 :: v_dual_mov_b32 v30, s29 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: v_readfirstlane_b32 s8, v4 -; GFX11-NEXT: v_readfirstlane_b32 s9, v5 -; GFX11-NEXT: v_readfirstlane_b32 s10, v6 -; GFX11-NEXT: v_readfirstlane_b32 s11, v7 -; GFX11-NEXT: v_readfirstlane_b32 s12, v8 -; GFX11-NEXT: v_readfirstlane_b32 s13, v9 -; GFX11-NEXT: v_readfirstlane_b32 s15, v10 -; GFX11-NEXT: v_readfirstlane_b32 s14, v11 +; GFX11-NEXT: v_readfirstlane_b32 s0, v13 +; GFX11-NEXT: v_readfirstlane_b32 s1, v14 +; GFX11-NEXT: v_readfirstlane_b32 s2, v15 +; GFX11-NEXT: v_readfirstlane_b32 s3, v16 +; GFX11-NEXT: v_readfirstlane_b32 s4, v17 +; GFX11-NEXT: v_readfirstlane_b32 s5, v18 +; GFX11-NEXT: v_readfirstlane_b32 s6, v19 +; GFX11-NEXT: v_readfirstlane_b32 s7, v20 +; GFX11-NEXT: v_readfirstlane_b32 s8, v21 +; GFX11-NEXT: v_readfirstlane_b32 s9, v22 +; GFX11-NEXT: v_readfirstlane_b32 s10, v23 +; GFX11-NEXT: v_readfirstlane_b32 s11, v24 +; GFX11-NEXT: v_readfirstlane_b32 s12, v25 +; GFX11-NEXT: v_readfirstlane_b32 s13, v26 +; GFX11-NEXT: v_readfirstlane_b32 s14, v27 +; GFX11-NEXT: v_readfirstlane_b32 s15, v28 +; GFX11-NEXT: v_readfirstlane_b32 s16, v29 +; GFX11-NEXT: v_readfirstlane_b32 s17, v30 +; GFX11-NEXT: v_readfirstlane_b32 s18, v0 +; GFX11-NEXT: v_readfirstlane_b32 s19, v1 +; GFX11-NEXT: v_readfirstlane_b32 s20, v2 +; GFX11-NEXT: v_readfirstlane_b32 s21, v3 +; GFX11-NEXT: v_readfirstlane_b32 s22, v4 +; GFX11-NEXT: v_readfirstlane_b32 s23, v5 +; GFX11-NEXT: v_readfirstlane_b32 s24, v6 +; GFX11-NEXT: v_readfirstlane_b32 s25, v7 +; GFX11-NEXT: v_readfirstlane_b32 s26, v8 +; GFX11-NEXT: v_readfirstlane_b32 s27, v9 +; GFX11-NEXT: v_readfirstlane_b32 s29, v10 +; GFX11-NEXT: v_readfirstlane_b32 s28, v11 ; GFX11-NEXT: s_mov_b32 s94, 0 ; GFX11-NEXT: s_and_b32 s40, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB45_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: s_lshr_b32 s40, s14, 16 -; GFX11-NEXT: s_lshr_b32 s41, s15, 16 -; GFX11-NEXT: s_lshr_b32 s42, s13, 16 -; GFX11-NEXT: s_lshr_b32 s43, s12, 16 -; GFX11-NEXT: s_lshr_b32 s44, s11, 16 -; GFX11-NEXT: s_lshr_b32 s45, s10, 16 -; GFX11-NEXT: s_lshr_b32 s46, s9, 16 -; GFX11-NEXT: s_lshr_b32 s47, s8, 16 -; GFX11-NEXT: s_lshr_b32 s56, s7, 16 -; GFX11-NEXT: s_lshr_b32 s57, s6, 16 -; GFX11-NEXT: s_lshr_b32 s58, s5, 16 -; GFX11-NEXT: s_lshr_b32 s59, s4, 16 -; GFX11-NEXT: s_lshr_b32 s60, s29, 16 -; GFX11-NEXT: s_lshr_b32 s61, s28, 16 -; GFX11-NEXT: s_lshr_b32 s62, s27, 16 -; GFX11-NEXT: s_lshr_b32 s63, s26, 16 -; GFX11-NEXT: s_lshr_b32 s72, s25, 16 -; GFX11-NEXT: s_lshr_b32 s73, s24, 16 -; GFX11-NEXT: s_lshr_b32 s74, s23, 16 -; GFX11-NEXT: s_lshr_b32 s75, s22, 16 -; GFX11-NEXT: s_lshr_b32 s76, s21, 16 -; GFX11-NEXT: s_lshr_b32 s77, s20, 16 -; GFX11-NEXT: s_lshr_b32 s78, s19, 16 -; GFX11-NEXT: s_lshr_b32 s79, s18, 16 -; GFX11-NEXT: s_lshr_b32 s88, s17, 16 -; GFX11-NEXT: s_lshr_b32 s89, s16, 16 +; GFX11-NEXT: s_lshr_b32 s40, s28, 16 +; GFX11-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-NEXT: s_lshr_b32 s44, s25, 16 +; GFX11-NEXT: s_lshr_b32 s45, s24, 16 +; GFX11-NEXT: s_lshr_b32 s46, s23, 16 +; GFX11-NEXT: s_lshr_b32 s47, s22, 16 +; GFX11-NEXT: s_lshr_b32 s56, s21, 16 +; GFX11-NEXT: s_lshr_b32 s57, s20, 16 +; GFX11-NEXT: s_lshr_b32 s58, s19, 16 +; GFX11-NEXT: s_lshr_b32 s59, s18, 16 +; GFX11-NEXT: s_lshr_b32 s60, s17, 16 +; GFX11-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-NEXT: s_lshr_b32 s62, s15, 16 +; GFX11-NEXT: s_lshr_b32 s63, s14, 16 +; GFX11-NEXT: s_lshr_b32 s72, s13, 16 +; GFX11-NEXT: s_lshr_b32 s73, s12, 16 +; GFX11-NEXT: s_lshr_b32 s74, s11, 16 +; GFX11-NEXT: s_lshr_b32 s75, s10, 16 +; GFX11-NEXT: s_lshr_b32 s76, s9, 16 +; GFX11-NEXT: s_lshr_b32 s77, s8, 16 +; GFX11-NEXT: s_lshr_b32 s78, s7, 16 +; GFX11-NEXT: s_lshr_b32 s79, s6, 16 +; GFX11-NEXT: s_lshr_b32 s88, s5, 16 +; GFX11-NEXT: s_lshr_b32 s89, s4, 16 ; GFX11-NEXT: s_lshr_b32 s90, s3, 16 ; GFX11-NEXT: s_lshr_b32 s91, s2, 16 ; GFX11-NEXT: s_lshr_b32 s92, s1, 16 @@ -34365,20 +34958,8 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s94 ; GFX11-NEXT: s_cbranch_vccnz .LBB45_3 ; GFX11-NEXT: .LBB45_2: ; %cmp.true -; GFX11-NEXT: s_add_u32 s15, s15, 3 -; GFX11-NEXT: s_addc_u32 s14, s14, 0 -; GFX11-NEXT: s_add_u32 s12, s12, 3 -; GFX11-NEXT: s_addc_u32 s13, s13, 0 -; GFX11-NEXT: s_add_u32 s10, s10, 3 -; GFX11-NEXT: s_addc_u32 s11, s11, 0 -; GFX11-NEXT: s_add_u32 s8, s8, 3 -; GFX11-NEXT: s_addc_u32 s9, s9, 0 -; GFX11-NEXT: s_add_u32 s6, s6, 3 -; GFX11-NEXT: s_addc_u32 s7, s7, 0 -; GFX11-NEXT: s_add_u32 s4, s4, 3 -; GFX11-NEXT: s_addc_u32 s5, s5, 0 -; GFX11-NEXT: s_add_u32 s28, s28, 3 -; GFX11-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-NEXT: s_add_u32 s29, s29, 3 +; GFX11-NEXT: s_addc_u32 s28, s28, 0 ; GFX11-NEXT: s_add_u32 s26, s26, 3 ; GFX11-NEXT: s_addc_u32 s27, s27, 0 ; GFX11-NEXT: s_add_u32 s24, s24, 3 @@ -34391,36 +34972,48 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; GFX11-NEXT: s_addc_u32 s19, s19, 0 ; GFX11-NEXT: s_add_u32 s16, s16, 3 ; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s14, s14, 3 +; GFX11-NEXT: s_addc_u32 s15, s15, 0 +; GFX11-NEXT: s_add_u32 s12, s12, 3 +; GFX11-NEXT: s_addc_u32 s13, s13, 0 +; GFX11-NEXT: s_add_u32 s10, s10, 3 +; GFX11-NEXT: s_addc_u32 s11, s11, 0 +; GFX11-NEXT: s_add_u32 s8, s8, 3 +; GFX11-NEXT: s_addc_u32 s9, s9, 0 +; GFX11-NEXT: s_add_u32 s6, s6, 3 +; GFX11-NEXT: s_addc_u32 s7, s7, 0 +; GFX11-NEXT: s_add_u32 s4, s4, 3 +; GFX11-NEXT: s_addc_u32 s5, s5, 0 ; GFX11-NEXT: s_add_u32 s2, s2, 3 ; GFX11-NEXT: s_addc_u32 s3, s3, 0 ; GFX11-NEXT: s_add_u32 s0, s0, 3 ; GFX11-NEXT: s_addc_u32 s1, s1, 0 -; GFX11-NEXT: s_lshr_b32 s40, s14, 16 -; GFX11-NEXT: s_lshr_b32 s41, s15, 16 -; GFX11-NEXT: s_lshr_b32 s42, s13, 16 -; GFX11-NEXT: s_lshr_b32 s43, s12, 16 -; GFX11-NEXT: s_lshr_b32 s44, s11, 16 -; GFX11-NEXT: s_lshr_b32 s45, s10, 16 -; GFX11-NEXT: s_lshr_b32 s46, s9, 16 -; GFX11-NEXT: s_lshr_b32 s47, s8, 16 -; GFX11-NEXT: s_lshr_b32 s56, s7, 16 -; GFX11-NEXT: s_lshr_b32 s57, s6, 16 -; GFX11-NEXT: s_lshr_b32 s58, s5, 16 -; GFX11-NEXT: s_lshr_b32 s59, s4, 16 -; GFX11-NEXT: s_lshr_b32 s60, s29, 16 -; GFX11-NEXT: s_lshr_b32 s61, s28, 16 -; GFX11-NEXT: s_lshr_b32 s62, s27, 16 -; GFX11-NEXT: s_lshr_b32 s63, s26, 16 -; GFX11-NEXT: s_lshr_b32 s72, s25, 16 -; GFX11-NEXT: s_lshr_b32 s73, s24, 16 -; GFX11-NEXT: s_lshr_b32 s74, s23, 16 -; GFX11-NEXT: s_lshr_b32 s75, s22, 16 -; GFX11-NEXT: s_lshr_b32 s76, s21, 16 -; GFX11-NEXT: s_lshr_b32 s77, s20, 16 -; GFX11-NEXT: s_lshr_b32 s78, s19, 16 -; GFX11-NEXT: s_lshr_b32 s79, s18, 16 -; GFX11-NEXT: s_lshr_b32 s88, s17, 16 -; GFX11-NEXT: s_lshr_b32 s89, s16, 16 +; GFX11-NEXT: s_lshr_b32 s40, s28, 16 +; GFX11-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-NEXT: s_lshr_b32 s44, s25, 16 +; GFX11-NEXT: s_lshr_b32 s45, s24, 16 +; GFX11-NEXT: s_lshr_b32 s46, s23, 16 +; GFX11-NEXT: s_lshr_b32 s47, s22, 16 +; GFX11-NEXT: s_lshr_b32 s56, s21, 16 +; GFX11-NEXT: s_lshr_b32 s57, s20, 16 +; GFX11-NEXT: s_lshr_b32 s58, s19, 16 +; GFX11-NEXT: s_lshr_b32 s59, s18, 16 +; GFX11-NEXT: s_lshr_b32 s60, s17, 16 +; GFX11-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-NEXT: s_lshr_b32 s62, s15, 16 +; GFX11-NEXT: s_lshr_b32 s63, s14, 16 +; GFX11-NEXT: s_lshr_b32 s72, s13, 16 +; GFX11-NEXT: s_lshr_b32 s73, s12, 16 +; GFX11-NEXT: s_lshr_b32 s74, s11, 16 +; GFX11-NEXT: s_lshr_b32 s75, s10, 16 +; GFX11-NEXT: s_lshr_b32 s76, s9, 16 +; GFX11-NEXT: s_lshr_b32 s77, s8, 16 +; GFX11-NEXT: s_lshr_b32 s78, s7, 16 +; GFX11-NEXT: s_lshr_b32 s79, s6, 16 +; GFX11-NEXT: s_lshr_b32 s88, s5, 16 +; GFX11-NEXT: s_lshr_b32 s89, s4, 16 ; GFX11-NEXT: s_lshr_b32 s90, s3, 16 ; GFX11-NEXT: s_lshr_b32 s91, s2, 16 ; GFX11-NEXT: s_lshr_b32 s92, s1, 16 @@ -34431,47 +35024,47 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s92 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s91 ; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s90 -; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s89 -; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s88 -; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s79 -; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s78 -; GFX11-NEXT: s_pack_ll_b32_b16 s20, s20, s77 -; GFX11-NEXT: s_pack_ll_b32_b16 s21, s21, s76 -; GFX11-NEXT: s_pack_ll_b32_b16 s22, s22, s75 -; GFX11-NEXT: s_pack_ll_b32_b16 s23, s23, s74 -; GFX11-NEXT: s_pack_ll_b32_b16 s24, s24, s73 -; GFX11-NEXT: s_pack_ll_b32_b16 s25, s25, s72 -; GFX11-NEXT: s_pack_ll_b32_b16 s26, s26, s63 -; GFX11-NEXT: s_pack_ll_b32_b16 s27, s27, s62 -; GFX11-NEXT: s_pack_ll_b32_b16 s28, s28, s61 -; GFX11-NEXT: s_pack_ll_b32_b16 s29, s29, s60 -; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s59 -; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s58 -; GFX11-NEXT: s_pack_ll_b32_b16 s6, s6, s57 -; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s56 -; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s47 -; GFX11-NEXT: s_pack_ll_b32_b16 s9, s9, s46 -; GFX11-NEXT: s_pack_ll_b32_b16 s10, s10, s45 -; GFX11-NEXT: s_pack_ll_b32_b16 s11, s11, s44 -; GFX11-NEXT: s_pack_ll_b32_b16 s12, s12, s43 -; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s42 -; GFX11-NEXT: s_pack_ll_b32_b16 s15, s15, s41 -; GFX11-NEXT: s_pack_ll_b32_b16 s14, s14, s40 +; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s89 +; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s88 +; GFX11-NEXT: s_pack_ll_b32_b16 s6, s6, s79 +; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s78 +; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s77 +; GFX11-NEXT: s_pack_ll_b32_b16 s9, s9, s76 +; GFX11-NEXT: s_pack_ll_b32_b16 s10, s10, s75 +; GFX11-NEXT: s_pack_ll_b32_b16 s11, s11, s74 +; GFX11-NEXT: s_pack_ll_b32_b16 s12, s12, s73 +; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s72 +; GFX11-NEXT: s_pack_ll_b32_b16 s14, s14, s63 +; GFX11-NEXT: s_pack_ll_b32_b16 s15, s15, s62 +; GFX11-NEXT: s_pack_ll_b32_b16 s16, s16, s61 +; GFX11-NEXT: s_pack_ll_b32_b16 s17, s17, s60 +; GFX11-NEXT: s_pack_ll_b32_b16 s18, s18, s59 +; GFX11-NEXT: s_pack_ll_b32_b16 s19, s19, s58 +; GFX11-NEXT: s_pack_ll_b32_b16 s20, s20, s57 +; GFX11-NEXT: s_pack_ll_b32_b16 s21, s21, s56 +; GFX11-NEXT: s_pack_ll_b32_b16 s22, s22, s47 +; GFX11-NEXT: s_pack_ll_b32_b16 s23, s23, s46 +; GFX11-NEXT: s_pack_ll_b32_b16 s24, s24, s45 +; GFX11-NEXT: s_pack_ll_b32_b16 s25, s25, s44 +; GFX11-NEXT: s_pack_ll_b32_b16 s26, s26, s43 +; GFX11-NEXT: s_pack_ll_b32_b16 s27, s27, s42 +; GFX11-NEXT: s_pack_ll_b32_b16 s29, s29, s41 +; GFX11-NEXT: s_pack_ll_b32_b16 s28, s28, s40 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 -; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 -; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 -; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 -; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 -; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 -; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 -; GFX11-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 -; GFX11-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 -; GFX11-NEXT: v_dual_mov_b32 v22, s8 :: v_dual_mov_b32 v23, s9 -; GFX11-NEXT: v_dual_mov_b32 v24, s10 :: v_dual_mov_b32 v25, s11 -; GFX11-NEXT: v_dual_mov_b32 v26, s12 :: v_dual_mov_b32 v27, s13 -; GFX11-NEXT: v_dual_mov_b32 v28, s15 :: v_dual_mov_b32 v29, s14 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GFX11-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v21, s21 +; GFX11-NEXT: v_dual_mov_b32 v22, s22 :: v_dual_mov_b32 v23, s23 +; GFX11-NEXT: v_dual_mov_b32 v24, s24 :: v_dual_mov_b32 v25, s25 +; GFX11-NEXT: v_dual_mov_b32 v26, s26 :: v_dual_mov_b32 v27, s27 +; GFX11-NEXT: v_dual_mov_b32 v28, s29 :: v_dual_mov_b32 v29, s28 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB45_4: ; GFX11-NEXT: ; implicit-def: $sgpr93 @@ -40290,7 +40883,7 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 ; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 ; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 -; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v27 ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v28 ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 ; SI-NEXT: .LBB50_4: ; %end @@ -43784,23 +44377,21 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; SI-NEXT: v_readfirstlane_b32 s42, v1 -; SI-NEXT: v_readfirstlane_b32 s43, v2 -; SI-NEXT: v_readfirstlane_b32 s40, v3 -; SI-NEXT: v_readfirstlane_b32 s41, v4 -; SI-NEXT: v_readfirstlane_b32 s14, v5 -; SI-NEXT: v_readfirstlane_b32 s15, v6 -; SI-NEXT: v_readfirstlane_b32 s12, v7 -; SI-NEXT: v_readfirstlane_b32 s13, v8 -; SI-NEXT: v_readfirstlane_b32 s10, v9 -; SI-NEXT: v_readfirstlane_b32 s11, v10 -; SI-NEXT: v_readfirstlane_b32 s8, v11 -; SI-NEXT: v_readfirstlane_b32 s9, v12 -; SI-NEXT: v_readfirstlane_b32 s6, v13 -; SI-NEXT: v_readfirstlane_b32 s7, v14 -; SI-NEXT: v_readfirstlane_b32 s4, v15 -; SI-NEXT: s_and_b64 s[44:45], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s5, v16 +; SI-NEXT: v_mov_b32_e32 v23, s16 +; SI-NEXT: v_mov_b32_e32 v24, s17 +; SI-NEXT: v_mov_b32_e32 v29, s18 +; SI-NEXT: v_mov_b32_e32 v30, s19 +; SI-NEXT: v_mov_b32_e32 v27, s20 +; SI-NEXT: v_mov_b32_e32 v28, s21 +; SI-NEXT: v_mov_b32_e32 v25, s22 +; SI-NEXT: v_mov_b32_e32 v26, s23 +; SI-NEXT: v_mov_b32_e32 v21, s24 +; SI-NEXT: v_mov_b32_e32 v22, s25 +; SI-NEXT: v_mov_b32_e32 v19, s26 +; SI-NEXT: v_mov_b32_e32 v20, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v17, s28 +; SI-NEXT: v_mov_b32_e32 v18, s29 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -43819,438 +44410,565 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s44, s5, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s44 -; SI-NEXT: s_lshr_b32 s44, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s44 -; SI-NEXT: s_lshr_b32 s44, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s44 -; SI-NEXT: s_lshr_b32 s44, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s44 -; SI-NEXT: s_lshr_b32 s44, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s44 -; SI-NEXT: s_lshr_b32 s44, s8, 16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v59, s44 -; SI-NEXT: s_lshr_b32 s44, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s44 -; SI-NEXT: s_lshr_b32 s44, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s44 -; SI-NEXT: s_lshr_b32 s44, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s44 -; SI-NEXT: s_lshr_b32 s44, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s44 -; SI-NEXT: s_lshr_b32 s44, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s44 -; SI-NEXT: s_lshr_b32 s44, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s44 -; SI-NEXT: s_lshr_b32 s44, s41, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s44 -; SI-NEXT: s_lshr_b32 s44, s40, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s44 -; SI-NEXT: s_lshr_b32 s44, s43, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s44 -; SI-NEXT: s_lshr_b32 s44, s42, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s44 -; SI-NEXT: s_lshr_b32 s44, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s44 -; SI-NEXT: s_lshr_b32 s44, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s44 -; SI-NEXT: s_lshr_b32 s44, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s44 -; SI-NEXT: s_lshr_b32 s44, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s44 -; SI-NEXT: s_lshr_b32 s44, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s44 -; SI-NEXT: s_lshr_b32 s44, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s44 -; SI-NEXT: s_lshr_b32 s44, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s44 -; SI-NEXT: s_lshr_b32 s44, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s44 -; SI-NEXT: s_lshr_b32 s44, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s44 -; SI-NEXT: s_lshr_b32 s44, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s44 -; SI-NEXT: s_lshr_b32 s44, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s44 -; SI-NEXT: s_lshr_b32 s44, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s44 -; SI-NEXT: s_lshr_b32 s44, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s44 -; SI-NEXT: s_lshr_b32 s44, s16, 16 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v60, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s11 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v61, s10 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v62, s13 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v63, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v12 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v42, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v31 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v42, v10 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v42, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v42, v8 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v42, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v31 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v42, v6 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v42, v5 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v15 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v42, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v13 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v42, v2 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v42, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v27 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v42, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v29 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v42, v17 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v42, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v24 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v42, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v23 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v42, v22 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v42, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v42, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v42, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v42, v28 ; SI-NEXT: s_cbranch_execnz .LBB53_3 ; SI-NEXT: .LBB53_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[1:2], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_add_f64 v[57:58], s[18:19], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v31 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v57 -; SI-NEXT: v_add_f64 v[41:42], s[20:21], 1.0 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v3, v62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v41 -; SI-NEXT: v_add_f64 v[53:54], s[22:23], 1.0 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v3, v60 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v42 -; SI-NEXT: v_add_f64 v[15:16], s[10:11], 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v53 -; SI-NEXT: v_add_f64 v[20:21], s[12:13], 1.0 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v57 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_add_f64 v[37:38], s[26:27], 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v15 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v55 -; SI-NEXT: v_add_f64 v[7:8], s[6:7], 1.0 -; SI-NEXT: v_add_f64 v[34:35], s[28:29], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v41 -; SI-NEXT: v_add_f64 v[30:31], s[42:43], 1.0 -; SI-NEXT: v_add_f64 v[11:12], s[8:9], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v44 -; SI-NEXT: v_add_f64 v[49:50], s[24:25], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v46 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f64 v[3:4], s[4:5], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v56 -; SI-NEXT: v_add_f64 v[26:27], s[40:41], 1.0 -; SI-NEXT: v_add_f64 v[22:23], s[14:15], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v59 +; SI-NEXT: v_mov_b32_e32 v59, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: v_add_f64 v[55:56], v[23:24], 1.0 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 +; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; SI-NEXT: v_add_f64 v[41:42], v[27:28], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v55, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v41, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v46, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v56, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v60, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v23 +; SI-NEXT: v_mov_b32_e32 v33, v15 +; SI-NEXT: v_mov_b32_e32 v31, v16 +; SI-NEXT: v_mov_b32_e32 v62, v13 +; SI-NEXT: v_mov_b32_e32 v60, v14 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: .LBB53_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v5, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v58 -; SI-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 4, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v9, v6 -; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v5, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v56 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v21 -; SI-NEXT: v_add_i32_e32 v9, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v20 -; SI-NEXT: v_add_i32_e32 v9, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v8 -; SI-NEXT: v_add_i32_e32 v9, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v42 -; SI-NEXT: v_add_i32_e32 v9, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v54 -; SI-NEXT: v_add_i32_e32 v7, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v16 -; SI-NEXT: v_add_i32_e32 v7, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v50 -; SI-NEXT: v_add_i32_e32 v7, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v15 -; SI-NEXT: v_add_i32_e32 v7, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v38 -; SI-NEXT: v_add_i32_e32 v7, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v34 -; SI-NEXT: v_add_i32_e32 v6, vcc, 48, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v3, v5 -; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v35 -; SI-NEXT: v_add_i32_e32 v6, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v12 -; SI-NEXT: v_add_i32_e32 v6, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v31 -; SI-NEXT: v_add_i32_e32 v6, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 -; SI-NEXT: v_add_i32_e32 v6, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v27 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v22 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x48, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v23 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v63 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v62 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v47 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x5c, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -44274,66 +44992,93 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_branch .LBB53_2 ; ; VI-LABEL: bitcast_v15f64_to_v60f16_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll index ab629e1a4d269..da908bc280e6e 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll @@ -380,9 +380,9 @@ define <12 x i8> @bitcast_v3i32_to_v12i8(<3 x i32> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB4_3: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v11, s4, v8, 24 -; SI-NEXT: v_alignbit_b32 v10, s4, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, s4, v8, 8 +; SI-NEXT: v_alignbit_b32 v11, v0, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v0, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v0, v8, 8 ; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 ; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 @@ -398,9 +398,9 @@ define <12 x i8> @bitcast_v3i32_to_v12i8(<3 x i32> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 ; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; SI-NEXT: v_alignbit_b32 v11, s4, v8, 24 -; SI-NEXT: v_alignbit_b32 v10, s4, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, s4, v8, 8 +; SI-NEXT: v_alignbit_b32 v11, v0, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v0, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v0, v8, 8 ; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 @@ -3114,7 +3114,7 @@ define <6 x i16> @bitcast_v3i32_to_v6i16(<3 x i32> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB16_3: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v5, s4, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, v0, v4, 16 ; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] @@ -3124,7 +3124,7 @@ define <6 x i16> @bitcast_v3i32_to_v6i16(<3 x i32> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_alignbit_b32 v5, s4, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, v0, v4, 16 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3575,9 +3575,9 @@ define <12 x i8> @bitcast_v3f32_to_v12i8(<3 x float> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB20_3: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v11, s4, v8, 24 -; SI-NEXT: v_alignbit_b32 v10, s4, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, s4, v8, 8 +; SI-NEXT: v_alignbit_b32 v11, v0, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v0, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v0, v8, 8 ; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 ; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 @@ -3593,9 +3593,9 @@ define <12 x i8> @bitcast_v3f32_to_v12i8(<3 x float> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 ; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; SI-NEXT: v_alignbit_b32 v11, s4, v8, 24 -; SI-NEXT: v_alignbit_b32 v10, s4, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, s4, v8, 8 +; SI-NEXT: v_alignbit_b32 v11, v0, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v0, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v0, v8, 8 ; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 @@ -6334,7 +6334,7 @@ define <6 x i16> @bitcast_v3f32_to_v6i16(<3 x float> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB32_3: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v5, s4, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, v0, v4, 16 ; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] @@ -6344,7 +6344,7 @@ define <6 x i16> @bitcast_v3f32_to_v6i16(<3 x float> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_alignbit_b32 v5, s4, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, v0, v4, 16 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 12517c2bc1b5d..0a098eb6582c7 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -14548,8 +14548,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 -; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 ; GFX7LESS-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] ; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc @@ -14579,8 +14579,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc ; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc ; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -14611,8 +14611,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc ; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 @@ -14645,8 +14645,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc ; GFX1064-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] ; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc ; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc @@ -14676,8 +14676,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo ; GFX1032-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] ; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo ; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc_lo @@ -14708,8 +14708,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] ; GFX1164-NEXT: s_waitcnt_depctr 0xfffd @@ -14740,8 +14740,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo ; GFX1132-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] ; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/buffer-intrinsic-mmo-type.ll b/llvm/test/CodeGen/AMDGPU/buffer-intrinsic-mmo-type.ll index 146010c7146ef..a7ce8589dd6bf 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-intrinsic-mmo-type.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-intrinsic-mmo-type.ll @@ -50,8 +50,7 @@ define amdgpu_ps void @buffer_store_v8f16(ptr addrspace(8) inreg %rsrc, <8 x hal ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY12]], %subreg.sub0, killed [[COPY11]], %subreg.sub1, killed [[COPY10]], %subreg.sub2, killed [[COPY9]], %subreg.sub3 ; GCN-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3 ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GCN-NEXT: [[COPY13:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE3]] - ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[COPY13]], [[COPY]], killed [[REG_SEQUENCE2]], killed [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.rsrc, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[REG_SEQUENCE3]], [[COPY]], killed [[REG_SEQUENCE2]], killed [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.rsrc, align 1, addrspace 8) ; GCN-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.ptr.buffer.store.v8f16(<8 x half> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll index 8769270f5246d..da5e73199a223 100644 --- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll +++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll @@ -391,27 +391,27 @@ entry: define amdgpu_ps void @cluster_image_sample(<8 x i32> inreg %src, <4 x i32> inreg %smp, <8 x i32> inreg %dst, i32 %x, i32 %y) { ; GFX9-LABEL: cluster_image_sample: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_cvt_f32_i32_e32 v8, v0 -; GFX9-NEXT: v_cvt_f32_i32_e32 v9, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mov_b32_e32 v5, v4 -; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v8 -; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v9 -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: v_mov_b32_e32 v7, v4 -; GFX9-NEXT: v_add_f32_e32 v8, 2.0, v8 -; GFX9-NEXT: v_add_f32_e32 v9, 2.0, v9 -; GFX9-NEXT: v_mov_b32_e32 v10, 1.0 +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v0 +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v4, 1.0 +; GFX9-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v3 ; GFX9-NEXT: v_mov_b32_e32 v11, v10 ; GFX9-NEXT: v_mov_b32_e32 v12, v10 ; GFX9-NEXT: v_mov_b32_e32 v13, v10 +; GFX9-NEXT: v_add_f32_e32 v2, 2.0, v2 +; GFX9-NEXT: v_add_f32_e32 v3, 2.0, v3 +; GFX9-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_mov_b32_e32 v7, v4 +; GFX9-NEXT: image_sample_d v[8:11], v[8:13], s[0:7], s[8:11] dmask:0xf ; GFX9-NEXT: image_sample_d v[2:5], v[2:7], s[0:7], s[8:11] dmask:0xf -; GFX9-NEXT: image_sample_d v[6:9], v[8:13], s[0:7], s[8:11] dmask:0xf ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f32_e32 v5, v5, v9 -; GFX9-NEXT: v_add_f32_e32 v4, v4, v8 -; GFX9-NEXT: v_add_f32_e32 v3, v3, v7 -; GFX9-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX9-NEXT: v_add_f32_e32 v5, v11, v5 +; GFX9-NEXT: v_add_f32_e32 v4, v10, v4 +; GFX9-NEXT: v_add_f32_e32 v3, v9, v3 +; GFX9-NEXT: v_add_f32_e32 v2, v8, v2 ; GFX9-NEXT: image_store v[2:5], v[0:1], s[12:19] dmask:0xf unorm ; GFX9-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll index 11facecaf6678..0c7dc74d95e49 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll @@ -1280,45 +1280,25 @@ define double @fmul_select_f64_test11(double %x, i32 %bool.arg1, i32 %bool.arg2) } define double @fmul_select_f64_test12(double %x, i32 %bool.arg1, i32 %bool.arg2) { -; GFX7-SDAG-LABEL: fmul_select_f64_test12: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v3, v4, 0, vcc -; GFX7-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX7-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmul_select_f64_test12: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc -; GFX7-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-SDAG-LABEL: fmul_select_f64_test12: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v4, 0, vcc -; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: fmul_select_f64_test12: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, 0 +; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc +; GFX7-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-GISEL-LABEL: fmul_select_f64_test12: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: fmul_select_f64_test12: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc +; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: fmul_select_f64_test12: ; GFX10: ; %bb.0: @@ -1345,45 +1325,25 @@ define double @fmul_select_f64_test12(double %x, i32 %bool.arg1, i32 %bool.arg2) } define double @fmul_select_f64_test13(double %x, i32 %bool.arg1, i32 %bool.arg2) { -; GFX7-SDAG-LABEL: fmul_select_f64_test13: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_mov_b32_e32 v4, 0x40300000 -; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v3, v4, 0, vcc -; GFX7-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX7-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmul_select_f64_test13: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, 0x40300000 -; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc -; GFX7-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-SDAG-LABEL: fmul_select_f64_test13: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0x40300000 -; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v4, 0, vcc -; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: fmul_select_f64_test13: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v5, 0x40300000 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, 0 +; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc +; GFX7-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-GISEL-LABEL: fmul_select_f64_test13: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0x40300000 -; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: fmul_select_f64_test13: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v5, 0x40300000 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc +; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: fmul_select_f64_test13: ; GFX10: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index 676f11b99f922..dd9a013d37203 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -351,8 +351,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v14, v9 ; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[13:14], s[6:7] ; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[11:12], s[8:9] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[11:12], s[6:7] ; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v4 ; GFX9-O0-NEXT: s_mov_b32 s12, 32 ; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s12 @@ -385,8 +384,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[8:9] ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 -; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[5:6], s[8:9] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[5:6], s[6:7] ; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v0 ; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s12 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v1 diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll index 1a1437a25a1fe..0c2d51dc9b0c2 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll @@ -12749,20 +12749,19 @@ define void @flat_atomic_fmax_f32_saddr_nortn(ptr inreg %ptr, float %data) { ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-SDAG-NEXT: flat_load_dword v3, v[2:3] offset:40 +; GFX950-SDAG-NEXT: flat_load_dword v1, v[2:3] offset:40 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-SDAG-NEXT: v_max_f32_e32 v4, v0, v0 -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX950-SDAG-NEXT: .LBB119_1: ; %atomicrmw.start ; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX950-SDAG-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX950-SDAG-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-SDAG-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX950-SDAG-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB119_1 ; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12873,20 +12872,19 @@ define void @flat_atomic_fmin_f32_saddr_nortn(ptr inreg %ptr, float %data) { ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-SDAG-NEXT: flat_load_dword v3, v[2:3] offset:40 +; GFX950-SDAG-NEXT: flat_load_dword v1, v[2:3] offset:40 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-SDAG-NEXT: v_max_f32_e32 v4, v0, v0 -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX950-SDAG-NEXT: .LBB121_1: ; %atomicrmw.start ; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX950-SDAG-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX950-SDAG-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-SDAG-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX950-SDAG-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB121_1 ; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13100,21 +13098,20 @@ define void @flat_atomic_fmax_v2f16_saddr_nortn(ptr inreg %ptr, <2 x half> %data ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-SDAG-NEXT: flat_load_dword v3, v[2:3] offset:40 +; GFX950-SDAG-NEXT: flat_load_dword v1, v[2:3] offset:40 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-SDAG-NEXT: v_pk_max_f16 v4, v0, v0 -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX950-SDAG-NEXT: .LBB125_1: ; %atomicrmw.start ; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX950-SDAG-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX950-SDAG-NEXT: s_nop 0 -; GFX950-SDAG-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX950-SDAG-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-SDAG-NEXT: v_pk_max_f16 v0, v0, v4 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB125_1 ; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13265,21 +13262,20 @@ define void @flat_atomic_fmin_v2f16_saddr_nortn(ptr inreg %ptr, <2 x half> %data ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-SDAG-NEXT: flat_load_dword v3, v[2:3] offset:40 +; GFX950-SDAG-NEXT: flat_load_dword v1, v[2:3] offset:40 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-SDAG-NEXT: v_pk_max_f16 v4, v0, v0 -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX950-SDAG-NEXT: .LBB127_1: ; %atomicrmw.start ; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX950-SDAG-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX950-SDAG-NEXT: s_nop 0 -; GFX950-SDAG-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX950-SDAG-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-SDAG-NEXT: v_pk_min_f16 v0, v0, v4 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB127_1 ; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll index 47161954cc332..9d49d5448aa68 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll @@ -9261,22 +9261,24 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_scalar(ptr inreg %ptr, i ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: .LBB145_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_subrev_i32_e32 v2, vcc, 1, v3 -; GCN1-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3 +; GCN1-NEXT: v_subrev_i32_e32 v0, vcc, 1, v1 +; GCN1-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 ; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GCN1-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB145_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9288,22 +9290,24 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_scalar(ptr inreg %ptr, i ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: .LBB145_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_subrev_u32_e32 v2, vcc, 1, v3 -; GCN2-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3 +; GCN2-NEXT: v_subrev_u32_e32 v0, vcc, 1, v1 +; GCN2-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 ; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB145_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9315,22 +9319,24 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_scalar(ptr inreg %ptr, i ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: flat_load_dword v1, v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s6 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 ; GCN3-NEXT: .LBB145_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_subrev_co_u32_e32 v2, vcc, 1, v3 -; GCN3-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3 +; GCN3-NEXT: v_subrev_co_u32_e32 v0, vcc, 1, v1 +; GCN3-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 ; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_cbranch_execnz .LBB145_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9404,22 +9410,24 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_offset_scalar(ptr inreg ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s6 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 ; GCN3-NEXT: .LBB146_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_subrev_co_u32_e32 v2, vcc, 1, v3 -; GCN3-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3 +; GCN3-NEXT: v_subrev_co_u32_e32 v0, vcc, 1, v1 +; GCN3-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 ; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_cbranch_execnz .LBB146_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll index 8cf91aa900662..8b43e707da055 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll @@ -5140,11 +5140,11 @@ define double @v_contract_mul_add_f64_select_64_1(i32 %arg, double %x, double %y ; GFX9-SDAG-LABEL: v_contract_mul_add_f64_select_64_1: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x3ff00000 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v6, 0x40500000 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v6, 0x3ff00000 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v7, 0x40500000 ; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v6, v5, v6, vcc ; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc ; GFX9-SDAG-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4] ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5204,11 +5204,11 @@ define double @v_contract_mul_add_f64_select_1_64(i32 %arg, double %x, double %y ; GFX9-SDAG-LABEL: v_contract_mul_add_f64_select_1_64: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x40500000 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v6, 0x3ff00000 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v6, 0x40500000 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v7, 0x3ff00000 ; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v6, v5, v6, vcc ; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc ; GFX9-SDAG-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4] ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5268,11 +5268,11 @@ define double @v_contract_mul_add_f64_select_n64_n1(i32 %arg, double %x, double ; GFX9-SDAG-LABEL: v_contract_mul_add_f64_select_n64_n1: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0xbff00000 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v6, 0xc0500000 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v6, 0xbff00000 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v7, 0xc0500000 ; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v6, v5, v6, vcc ; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc ; GFX9-SDAG-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4] ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5332,11 +5332,11 @@ define double @v_contract_mul_add_f64_select_n1_n64(i32 %arg, double %x, double ; GFX9-SDAG-LABEL: v_contract_mul_add_f64_select_n1_n64: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0xc0500000 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v6, 0xbff00000 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v6, 0xc0500000 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v7, 0xbff00000 ; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v6, v5, v6, vcc ; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc ; GFX9-SDAG-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4] ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5396,11 +5396,11 @@ define double @v_contract_mul_add_f64_select_128_64(i32 %arg, double %x, double ; GFX9-SDAG-LABEL: v_contract_mul_add_f64_select_128_64: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x40500000 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v6, 0x40600000 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v6, 0x40500000 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v7, 0x40600000 ; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v6, v5, v6, vcc ; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc ; GFX9-SDAG-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4] ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5463,11 +5463,11 @@ define double @v_contract_mul_add_f64_select_128_4(i32 %arg, double %x, double % ; GFX9-SDAG-LABEL: v_contract_mul_add_f64_select_128_4: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x40100000 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v6, 0x40600000 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v6, 0x40100000 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v7, 0x40600000 ; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v6, v5, v6, vcc ; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc ; GFX9-SDAG-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4] ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5527,10 +5527,10 @@ define double @v_contract_mul_add_f64_select_2_4(i32 %arg, double %x, double %y) ; GFX9-SDAG-LABEL: v_contract_mul_add_f64_select_2_4: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x40100000 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v6, 0x40100000 ; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v5, 2.0, vcc ; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v6, 2.0, vcc ; GFX9-SDAG-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4] ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5592,11 +5592,11 @@ define double @v_contract_mul_add_f64_select_4_128(i32 %arg, double %x, double % ; GFX9-SDAG-LABEL: v_contract_mul_add_f64_select_4_128: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x40600000 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v6, 0x40100000 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v6, 0x40600000 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v7, 0x40100000 ; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v6, v5, v6, vcc ; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc ; GFX9-SDAG-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4] ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll index 0c4a15f6a9d5e..15619532414ea 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll @@ -52,41 +52,41 @@ define i128 @fptosi_f64_to_i128(double %x) { ; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else ; SDAG-NEXT: v_sub_u32_e32 v0, 0x473, v6 ; SDAG-NEXT: v_add_u32_e32 v2, 0xfffffb8d, v6 -; SDAG-NEXT: v_add_u32_e32 v7, 0xfffffbcd, v6 +; SDAG-NEXT: v_add_u32_e32 v3, 0xfffffbcd, v6 ; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[4:5] -; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[4:5] -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v7 -; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] -; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v7 -; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v1, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] -; SDAG-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v10, 0 -; SDAG-NEXT: v_mul_lo_u32 v12, v8, v2 -; SDAG-NEXT: v_mul_lo_u32 v13, v10, v3 -; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v2, 0 +; SDAG-NEXT: v_lshlrev_b64 v[6:7], v2, v[4:5] +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v3 +; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v3 +; SDAG-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v0, v6, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v3, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[6:7] +; SDAG-NEXT: v_mul_lo_u32 v12, v10, v1 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v7, v10, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 -; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v10, v[1:2] -; SDAG-NEXT: v_add3_u32 v4, v4, v13, v12 -; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v11, v[3:4] -; SDAG-NEXT: v_mov_b32_e32 v1, v5 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v8, v[1:2] -; SDAG-NEXT: v_mul_lo_u32 v10, v9, v7 -; SDAG-NEXT: v_mul_lo_u32 v9, v9, v11 -; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v6, v2 -; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v8, v[5:6] -; SDAG-NEXT: v_add3_u32 v4, v9, v4, v10 -; SDAG-NEXT: ; implicit-def: $vgpr10 +; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v4, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v10, v[1:2] +; SDAG-NEXT: v_mul_lo_u32 v11, v8, v5 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[6:7], v10, v5, 0 +; SDAG-NEXT: v_mov_b32_e32 v1, v3 +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v8, v[1:2] +; SDAG-NEXT: v_add3_u32 v6, v6, v12, v11 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v7, v[5:6] +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v4, v2 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v10, v9, v13 +; SDAG-NEXT: v_mul_lo_u32 v7, v9, v7 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v8, v[2:3] ; SDAG-NEXT: ; implicit-def: $vgpr8 ; SDAG-NEXT: ; implicit-def: $vgpr9 -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v3 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v4, s[4:5] +; SDAG-NEXT: v_add3_u32 v4, v7, v6, v10 +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v2, v5 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v3, v4, s[4:5] ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 ; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; SDAG-NEXT: ; implicit-def: $vgpr10 ; SDAG-NEXT: .LBB0_4: ; %Flow ; SDAG-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] ; SDAG-NEXT: s_cbranch_execz .LBB0_6 @@ -417,41 +417,41 @@ define i128 @fptoui_f64_to_i128(double %x) { ; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else ; SDAG-NEXT: v_sub_u32_e32 v0, 0x473, v6 ; SDAG-NEXT: v_add_u32_e32 v2, 0xfffffb8d, v6 -; SDAG-NEXT: v_add_u32_e32 v7, 0xfffffbcd, v6 +; SDAG-NEXT: v_add_u32_e32 v3, 0xfffffbcd, v6 ; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[4:5] -; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[4:5] -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v7 -; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] -; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v7 -; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v1, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] -; SDAG-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v10, 0 -; SDAG-NEXT: v_mul_lo_u32 v12, v8, v2 -; SDAG-NEXT: v_mul_lo_u32 v13, v10, v3 -; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v2, 0 +; SDAG-NEXT: v_lshlrev_b64 v[6:7], v2, v[4:5] +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v3 +; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v3 +; SDAG-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v0, v6, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v3, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[6:7] +; SDAG-NEXT: v_mul_lo_u32 v12, v10, v1 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v7, v10, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 -; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v10, v[1:2] -; SDAG-NEXT: v_add3_u32 v4, v4, v13, v12 -; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v11, v[3:4] -; SDAG-NEXT: v_mov_b32_e32 v1, v5 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v8, v[1:2] -; SDAG-NEXT: v_mul_lo_u32 v10, v9, v7 -; SDAG-NEXT: v_mul_lo_u32 v9, v9, v11 -; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v6, v2 -; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v8, v[5:6] -; SDAG-NEXT: v_add3_u32 v4, v9, v4, v10 -; SDAG-NEXT: ; implicit-def: $vgpr10 +; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v4, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v10, v[1:2] +; SDAG-NEXT: v_mul_lo_u32 v11, v8, v5 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[6:7], v10, v5, 0 +; SDAG-NEXT: v_mov_b32_e32 v1, v3 +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v8, v[1:2] +; SDAG-NEXT: v_add3_u32 v6, v6, v12, v11 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v7, v[5:6] +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v4, v2 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v10, v9, v13 +; SDAG-NEXT: v_mul_lo_u32 v7, v9, v7 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v8, v[2:3] ; SDAG-NEXT: ; implicit-def: $vgpr8 ; SDAG-NEXT: ; implicit-def: $vgpr9 -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v3 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v4, s[4:5] +; SDAG-NEXT: v_add3_u32 v4, v7, v6, v10 +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v2, v5 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v3, v4, s[4:5] ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 ; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; SDAG-NEXT: ; implicit-def: $vgpr10 ; SDAG-NEXT: .LBB1_4: ; %Flow ; SDAG-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] ; SDAG-NEXT: s_cbranch_execz .LBB1_6 diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll index f2fe61f5376e4..8b53962714f68 100644 --- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll +++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll @@ -244,9 +244,9 @@ declare void @func(ptr addrspace(5) nocapture) #0 ; GCN-LABEL: {{^}}undefined_stack_store_reg: ; GCN: s_and_saveexec_b64 -; MUBUF: buffer_store_dword v0, off, s[0:3], s33 offset: -; MUBUF: buffer_store_dword v0, off, s[0:3], s33 offset: -; MUBUF: buffer_store_dword v0, off, s[0:3], s33 offset: +; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset: +; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset: +; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset: ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset: ; FLATSCR: scratch_store_dword v0, off, s33 offset: ; FLATSCR: scratch_store_dword v0, off, s33 offset: diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll index 39fa342d2a66f..dec5d288cfb59 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll @@ -49,23 +49,22 @@ define amdgpu_ps double @global_atomic_fadd_f64_rtn_atomicrmw(ptr addrspace(1) % ; GFX90A-NEXT: [[COPY8:%[0-9]+]]:av_32 = COPY [[V_ADD_F64_e64_]].sub0 ; GFX90A-NEXT: [[COPY9:%[0-9]+]]:av_32 = COPY [[PHI1]].sub1 ; GFX90A-NEXT: [[COPY10:%[0-9]+]]:av_32 = COPY [[PHI1]].sub0 - ; GFX90A-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY10]], %subreg.sub2, killed [[COPY9]], %subreg.sub3 - ; GFX90A-NEXT: [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[REG_SEQUENCE2]] - ; GFX90A-NEXT: [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN [[COPY5]], killed [[COPY11]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY10]], %subreg.sub2, killed [[COPY9]], %subreg.sub3 + ; GFX90A-NEXT: [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN [[COPY5]], killed [[REG_SEQUENCE2]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic monotonic (s64) on %ir.ptr, addrspace 1) ; GFX90A-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U64_e64 [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN]], [[PHI1]], implicit $exec - ; GFX90A-NEXT: [[COPY12:%[0-9]+]]:av_64_align2 = COPY [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN]] + ; GFX90A-NEXT: [[COPY11:%[0-9]+]]:av_64_align2 = COPY [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN]] ; GFX90A-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_64 = SI_IF_BREAK killed [[V_CMP_EQ_U64_e64_]], [[PHI]], implicit-def dead $scc ; GFX90A-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.2 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2.atomicrmw.end: - ; GFX90A-NEXT: [[PHI2:%[0-9]+]]:av_64_align2 = PHI [[COPY12]], %bb.1 + ; GFX90A-NEXT: [[PHI2:%[0-9]+]]:av_64_align2 = PHI [[COPY11]], %bb.1 ; GFX90A-NEXT: [[PHI3:%[0-9]+]]:sreg_64 = PHI [[SI_IF_BREAK]], %bb.1 ; GFX90A-NEXT: SI_END_CF [[PHI3]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; GFX90A-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub0 - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec - ; GFX90A-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub1 - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec + ; GFX90A-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub0 + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec + ; GFX90A-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub1 + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec ; GFX90A-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX90A-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX90A-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll index 93d7eeb085107..e59af2e82671e 100644 --- a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll +++ b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll @@ -264,7 +264,7 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-SDAG-NEXT: v_mov_b64_e32 v[52:53], 0x50 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[54:55], 64 ; GCN-SDAG-NEXT: s_wait_xcnt 0x0 -; GCN-SDAG-NEXT: v_dual_mov_b32 v0, 0xc8 :: v_dual_mov_b32 v1, 0 +; GCN-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, 0xc8 ; GCN-SDAG-NEXT: s_wait_loadcnt 0x7 ; GCN-SDAG-NEXT: global_store_b128 v[2:3], v[6:9], off ; GCN-SDAG-NEXT: s_wait_loadcnt 0x6 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll index 1a45bd978ccc1..fb5e669d680f5 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll @@ -10578,22 +10578,24 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_scalar(ptr addrspace(1 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB148_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_subrev_u32_e32 v2, vcc, 1, v3 -; VI-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3 +; VI-NEXT: v_subrev_u32_e32 v0, vcc, 1, v1 +; VI-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 ; VI-NEXT: s_or_b64 vcc, vcc, s[34:35] -; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB148_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll index fdcb033f4f4d6..b1d382040addc 100644 --- a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll +++ b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll @@ -84,11 +84,10 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc(<4 x i32> inre ; DAGISEL-GFX11-WF32-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY1]], 0, implicit $exec ; DAGISEL-GFX11-WF32-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY6]], [[COPY2]], 0, implicit $exec ; DAGISEL-GFX11-WF32-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY7]], [[COPY3]], 0, implicit $exec - ; DAGISEL-GFX11-WF32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_ADD_U32_e64_3]], %subreg.sub0, [[V_ADD_U32_e64_2]], %subreg.sub1, [[V_ADD_U32_e64_1]], %subreg.sub2, [[V_ADD_U32_e64_]], %subreg.sub3 + ; DAGISEL-GFX11-WF32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[V_ADD_U32_e64_3]], %subreg.sub0, killed [[V_ADD_U32_e64_2]], %subreg.sub1, killed [[V_ADD_U32_e64_1]], %subreg.sub2, killed [[V_ADD_U32_e64_]], %subreg.sub3 ; DAGISEL-GFX11-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-WF32-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[DEF]] - ; DAGISEL-GFX11-WF32-NEXT: [[COPY9:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]] - ; DAGISEL-GFX11-WF32-NEXT: FLAT_STORE_DWORDX4 killed [[COPY8]], killed [[COPY9]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`) + ; DAGISEL-GFX11-WF32-NEXT: FLAT_STORE_DWORDX4 killed [[COPY8]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`) ; DAGISEL-GFX11-WF32-NEXT: S_ENDPGM 0 ; ; DAGISEL-GFX11-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc @@ -107,11 +106,10 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc(<4 x i32> inre ; DAGISEL-GFX11-WF64-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY1]], 0, implicit $exec ; DAGISEL-GFX11-WF64-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY6]], [[COPY2]], 0, implicit $exec ; DAGISEL-GFX11-WF64-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY7]], [[COPY3]], 0, implicit $exec - ; DAGISEL-GFX11-WF64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_ADD_U32_e64_3]], %subreg.sub0, [[V_ADD_U32_e64_2]], %subreg.sub1, [[V_ADD_U32_e64_1]], %subreg.sub2, [[V_ADD_U32_e64_]], %subreg.sub3 + ; DAGISEL-GFX11-WF64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[V_ADD_U32_e64_3]], %subreg.sub0, killed [[V_ADD_U32_e64_2]], %subreg.sub1, killed [[V_ADD_U32_e64_1]], %subreg.sub2, killed [[V_ADD_U32_e64_]], %subreg.sub3 ; DAGISEL-GFX11-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-WF64-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[DEF]] - ; DAGISEL-GFX11-WF64-NEXT: [[COPY9:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]] - ; DAGISEL-GFX11-WF64-NEXT: FLAT_STORE_DWORDX4 killed [[COPY8]], killed [[COPY9]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`) + ; DAGISEL-GFX11-WF64-NEXT: FLAT_STORE_DWORDX4 killed [[COPY8]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`) ; DAGISEL-GFX11-WF64-NEXT: S_ENDPGM 0 ; ; DAGISEL-GFX10-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc @@ -130,11 +128,10 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc(<4 x i32> inre ; DAGISEL-GFX10-WF32-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY1]], 0, implicit $exec ; DAGISEL-GFX10-WF32-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY6]], [[COPY2]], 0, implicit $exec ; DAGISEL-GFX10-WF32-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY7]], [[COPY3]], 0, implicit $exec - ; DAGISEL-GFX10-WF32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_ADD_U32_e64_3]], %subreg.sub0, [[V_ADD_U32_e64_2]], %subreg.sub1, [[V_ADD_U32_e64_1]], %subreg.sub2, [[V_ADD_U32_e64_]], %subreg.sub3 + ; DAGISEL-GFX10-WF32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[V_ADD_U32_e64_3]], %subreg.sub0, killed [[V_ADD_U32_e64_2]], %subreg.sub1, killed [[V_ADD_U32_e64_1]], %subreg.sub2, killed [[V_ADD_U32_e64_]], %subreg.sub3 ; DAGISEL-GFX10-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-WF32-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[DEF]] - ; DAGISEL-GFX10-WF32-NEXT: [[COPY9:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]] - ; DAGISEL-GFX10-WF32-NEXT: FLAT_STORE_DWORDX4 killed [[COPY8]], killed [[COPY9]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`) + ; DAGISEL-GFX10-WF32-NEXT: FLAT_STORE_DWORDX4 killed [[COPY8]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`) ; DAGISEL-GFX10-WF32-NEXT: S_ENDPGM 0 ; ; DAGISEL-GFX10-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc @@ -153,11 +150,10 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc(<4 x i32> inre ; DAGISEL-GFX10-WF64-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY1]], 0, implicit $exec ; DAGISEL-GFX10-WF64-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY6]], [[COPY2]], 0, implicit $exec ; DAGISEL-GFX10-WF64-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY7]], [[COPY3]], 0, implicit $exec - ; DAGISEL-GFX10-WF64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_ADD_U32_e64_3]], %subreg.sub0, [[V_ADD_U32_e64_2]], %subreg.sub1, [[V_ADD_U32_e64_1]], %subreg.sub2, [[V_ADD_U32_e64_]], %subreg.sub3 + ; DAGISEL-GFX10-WF64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[V_ADD_U32_e64_3]], %subreg.sub0, killed [[V_ADD_U32_e64_2]], %subreg.sub1, killed [[V_ADD_U32_e64_1]], %subreg.sub2, killed [[V_ADD_U32_e64_]], %subreg.sub3 ; DAGISEL-GFX10-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-WF64-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[DEF]] - ; DAGISEL-GFX10-WF64-NEXT: [[COPY9:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]] - ; DAGISEL-GFX10-WF64-NEXT: FLAT_STORE_DWORDX4 killed [[COPY8]], killed [[COPY9]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`) + ; DAGISEL-GFX10-WF64-NEXT: FLAT_STORE_DWORDX4 killed [[COPY8]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`) ; DAGISEL-GFX10-WF64-NEXT: S_ENDPGM 0 %c = add <4 x i32> %a, %b store <4 x i32> %c, ptr poison @@ -488,15 +484,14 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_struct( {ptr, ; DAGISEL-GFX11-WF32-NEXT: GLOBAL_STORE_DWORDX4 [[COPY23]], killed [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison`, addrspace 1) ; DAGISEL-GFX11-WF32-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-WF32-NEXT: [[COPY24:%[0-9]+]]:vreg_64 = COPY [[DEF3]] - ; DAGISEL-GFX11-WF32-NEXT: [[COPY25:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]] - ; DAGISEL-GFX11-WF32-NEXT: GLOBAL_STORE_DWORDX4 [[COPY24]], killed [[COPY25]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison` + 16, addrspace 1) + ; DAGISEL-GFX11-WF32-NEXT: GLOBAL_STORE_DWORDX4 [[COPY24]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison` + 16, addrspace 1) ; DAGISEL-GFX11-WF32-NEXT: [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; DAGISEL-GFX11-WF32-NEXT: [[COPY26:%[0-9]+]]:vreg_64 = COPY [[DEF4]] - ; DAGISEL-GFX11-WF32-NEXT: GLOBAL_STORE_DWORD [[COPY26]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison` + 8, align 8, basealign 16, addrspace 1) + ; DAGISEL-GFX11-WF32-NEXT: [[COPY25:%[0-9]+]]:vreg_64 = COPY [[DEF4]] + ; DAGISEL-GFX11-WF32-NEXT: GLOBAL_STORE_DWORD [[COPY25]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison` + 8, align 8, basealign 16, addrspace 1) ; DAGISEL-GFX11-WF32-NEXT: [[DEF5:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; DAGISEL-GFX11-WF32-NEXT: [[COPY27:%[0-9]+]]:vreg_64 = COPY [[DEF5]] - ; DAGISEL-GFX11-WF32-NEXT: [[COPY28:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]] - ; DAGISEL-GFX11-WF32-NEXT: GLOBAL_STORE_DWORDX2 [[COPY27]], killed [[COPY28]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, align 16, addrspace 1) + ; DAGISEL-GFX11-WF32-NEXT: [[COPY26:%[0-9]+]]:vreg_64 = COPY [[DEF5]] + ; DAGISEL-GFX11-WF32-NEXT: [[COPY27:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]] + ; DAGISEL-GFX11-WF32-NEXT: GLOBAL_STORE_DWORDX2 [[COPY26]], killed [[COPY27]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, align 16, addrspace 1) ; DAGISEL-GFX11-WF32-NEXT: S_ENDPGM 0 ; ; DAGISEL-GFX11-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_struct @@ -539,15 +534,14 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_struct( {ptr, ; DAGISEL-GFX11-WF64-NEXT: GLOBAL_STORE_DWORDX4 [[COPY23]], killed [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison`, addrspace 1) ; DAGISEL-GFX11-WF64-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-WF64-NEXT: [[COPY24:%[0-9]+]]:vreg_64 = COPY [[DEF3]] - ; DAGISEL-GFX11-WF64-NEXT: [[COPY25:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]] - ; DAGISEL-GFX11-WF64-NEXT: GLOBAL_STORE_DWORDX4 [[COPY24]], killed [[COPY25]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison` + 16, addrspace 1) + ; DAGISEL-GFX11-WF64-NEXT: GLOBAL_STORE_DWORDX4 [[COPY24]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison` + 16, addrspace 1) ; DAGISEL-GFX11-WF64-NEXT: [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; DAGISEL-GFX11-WF64-NEXT: [[COPY26:%[0-9]+]]:vreg_64 = COPY [[DEF4]] - ; DAGISEL-GFX11-WF64-NEXT: GLOBAL_STORE_DWORD [[COPY26]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison` + 8, align 8, basealign 16, addrspace 1) + ; DAGISEL-GFX11-WF64-NEXT: [[COPY25:%[0-9]+]]:vreg_64 = COPY [[DEF4]] + ; DAGISEL-GFX11-WF64-NEXT: GLOBAL_STORE_DWORD [[COPY25]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison` + 8, align 8, basealign 16, addrspace 1) ; DAGISEL-GFX11-WF64-NEXT: [[DEF5:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; DAGISEL-GFX11-WF64-NEXT: [[COPY27:%[0-9]+]]:vreg_64 = COPY [[DEF5]] - ; DAGISEL-GFX11-WF64-NEXT: [[COPY28:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]] - ; DAGISEL-GFX11-WF64-NEXT: GLOBAL_STORE_DWORDX2 [[COPY27]], killed [[COPY28]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, align 16, addrspace 1) + ; DAGISEL-GFX11-WF64-NEXT: [[COPY26:%[0-9]+]]:vreg_64 = COPY [[DEF5]] + ; DAGISEL-GFX11-WF64-NEXT: [[COPY27:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]] + ; DAGISEL-GFX11-WF64-NEXT: GLOBAL_STORE_DWORDX2 [[COPY26]], killed [[COPY27]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, align 16, addrspace 1) ; DAGISEL-GFX11-WF64-NEXT: S_ENDPGM 0 ; ; DAGISEL-GFX10-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc_struct @@ -590,15 +584,14 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_struct( {ptr, ; DAGISEL-GFX10-WF32-NEXT: GLOBAL_STORE_DWORDX4 [[COPY23]], killed [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison`, addrspace 1) ; DAGISEL-GFX10-WF32-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-WF32-NEXT: [[COPY24:%[0-9]+]]:vreg_64 = COPY [[DEF3]] - ; DAGISEL-GFX10-WF32-NEXT: [[COPY25:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]] - ; DAGISEL-GFX10-WF32-NEXT: GLOBAL_STORE_DWORDX4 [[COPY24]], killed [[COPY25]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison` + 16, addrspace 1) + ; DAGISEL-GFX10-WF32-NEXT: GLOBAL_STORE_DWORDX4 [[COPY24]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison` + 16, addrspace 1) ; DAGISEL-GFX10-WF32-NEXT: [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; DAGISEL-GFX10-WF32-NEXT: [[COPY26:%[0-9]+]]:vreg_64 = COPY [[DEF4]] - ; DAGISEL-GFX10-WF32-NEXT: GLOBAL_STORE_DWORD [[COPY26]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison` + 8, align 8, basealign 16, addrspace 1) + ; DAGISEL-GFX10-WF32-NEXT: [[COPY25:%[0-9]+]]:vreg_64 = COPY [[DEF4]] + ; DAGISEL-GFX10-WF32-NEXT: GLOBAL_STORE_DWORD [[COPY25]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison` + 8, align 8, basealign 16, addrspace 1) ; DAGISEL-GFX10-WF32-NEXT: [[DEF5:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; DAGISEL-GFX10-WF32-NEXT: [[COPY27:%[0-9]+]]:vreg_64 = COPY [[DEF5]] - ; DAGISEL-GFX10-WF32-NEXT: [[COPY28:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]] - ; DAGISEL-GFX10-WF32-NEXT: GLOBAL_STORE_DWORDX2 [[COPY27]], killed [[COPY28]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, align 16, addrspace 1) + ; DAGISEL-GFX10-WF32-NEXT: [[COPY26:%[0-9]+]]:vreg_64 = COPY [[DEF5]] + ; DAGISEL-GFX10-WF32-NEXT: [[COPY27:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]] + ; DAGISEL-GFX10-WF32-NEXT: GLOBAL_STORE_DWORDX2 [[COPY26]], killed [[COPY27]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, align 16, addrspace 1) ; DAGISEL-GFX10-WF32-NEXT: S_ENDPGM 0 ; ; DAGISEL-GFX10-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_struct @@ -641,15 +634,14 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_struct( {ptr, ; DAGISEL-GFX10-WF64-NEXT: GLOBAL_STORE_DWORDX4 [[COPY23]], killed [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison`, addrspace 1) ; DAGISEL-GFX10-WF64-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-WF64-NEXT: [[COPY24:%[0-9]+]]:vreg_64 = COPY [[DEF3]] - ; DAGISEL-GFX10-WF64-NEXT: [[COPY25:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]] - ; DAGISEL-GFX10-WF64-NEXT: GLOBAL_STORE_DWORDX4 [[COPY24]], killed [[COPY25]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison` + 16, addrspace 1) + ; DAGISEL-GFX10-WF64-NEXT: GLOBAL_STORE_DWORDX4 [[COPY24]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison` + 16, addrspace 1) ; DAGISEL-GFX10-WF64-NEXT: [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; DAGISEL-GFX10-WF64-NEXT: [[COPY26:%[0-9]+]]:vreg_64 = COPY [[DEF4]] - ; DAGISEL-GFX10-WF64-NEXT: GLOBAL_STORE_DWORD [[COPY26]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison` + 8, align 8, basealign 16, addrspace 1) + ; DAGISEL-GFX10-WF64-NEXT: [[COPY25:%[0-9]+]]:vreg_64 = COPY [[DEF4]] + ; DAGISEL-GFX10-WF64-NEXT: GLOBAL_STORE_DWORD [[COPY25]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison` + 8, align 8, basealign 16, addrspace 1) ; DAGISEL-GFX10-WF64-NEXT: [[DEF5:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; DAGISEL-GFX10-WF64-NEXT: [[COPY27:%[0-9]+]]:vreg_64 = COPY [[DEF5]] - ; DAGISEL-GFX10-WF64-NEXT: [[COPY28:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]] - ; DAGISEL-GFX10-WF64-NEXT: GLOBAL_STORE_DWORDX2 [[COPY27]], killed [[COPY28]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, align 16, addrspace 1) + ; DAGISEL-GFX10-WF64-NEXT: [[COPY26:%[0-9]+]]:vreg_64 = COPY [[DEF5]] + ; DAGISEL-GFX10-WF64-NEXT: [[COPY27:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]] + ; DAGISEL-GFX10-WF64-NEXT: GLOBAL_STORE_DWORDX2 [[COPY26]], killed [[COPY27]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, align 16, addrspace 1) ; DAGISEL-GFX10-WF64-NEXT: S_ENDPGM 0 %p = extractvalue {ptr, i32, <4 x i32>} %a, 0 %i = extractvalue {ptr, i32, <4 x i32>} %a, 1 @@ -1233,16 +1225,14 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_v16i16(<16 x i ; DAGISEL-GFX11-WF32-NEXT: [[V_PK_ADD_U16_5:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY9]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec ; DAGISEL-GFX11-WF32-NEXT: [[V_PK_ADD_U16_6:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY10]], 8, [[COPY2]], 0, 0, 0, 0, 0, implicit $exec ; DAGISEL-GFX11-WF32-NEXT: [[V_PK_ADD_U16_7:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY11]], 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $exec - ; DAGISEL-GFX11-WF32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_7]], %subreg.sub0, [[V_PK_ADD_U16_6]], %subreg.sub1, [[V_PK_ADD_U16_5]], %subreg.sub2, [[V_PK_ADD_U16_4]], %subreg.sub3 + ; DAGISEL-GFX11-WF32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[V_PK_ADD_U16_7]], %subreg.sub0, killed [[V_PK_ADD_U16_6]], %subreg.sub1, killed [[V_PK_ADD_U16_5]], %subreg.sub2, killed [[V_PK_ADD_U16_4]], %subreg.sub3 ; DAGISEL-GFX11-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-WF32-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[DEF]] - ; DAGISEL-GFX11-WF32-NEXT: [[COPY17:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]] - ; DAGISEL-GFX11-WF32-NEXT: FLAT_STORE_DWORDX4 [[COPY16]], killed [[COPY17]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison` + 16) - ; DAGISEL-GFX11-WF32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_3]], %subreg.sub0, [[V_PK_ADD_U16_2]], %subreg.sub1, [[V_PK_ADD_U16_1]], %subreg.sub2, [[V_PK_ADD_U16_]], %subreg.sub3 + ; DAGISEL-GFX11-WF32-NEXT: FLAT_STORE_DWORDX4 [[COPY16]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison` + 16) + ; DAGISEL-GFX11-WF32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[V_PK_ADD_U16_3]], %subreg.sub0, killed [[V_PK_ADD_U16_2]], %subreg.sub1, killed [[V_PK_ADD_U16_1]], %subreg.sub2, killed [[V_PK_ADD_U16_]], %subreg.sub3 ; DAGISEL-GFX11-WF32-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; DAGISEL-GFX11-WF32-NEXT: [[COPY18:%[0-9]+]]:vreg_64 = COPY [[DEF1]] - ; DAGISEL-GFX11-WF32-NEXT: [[COPY19:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE1]] - ; DAGISEL-GFX11-WF32-NEXT: FLAT_STORE_DWORDX4 [[COPY18]], killed [[COPY19]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`, align 32) + ; DAGISEL-GFX11-WF32-NEXT: [[COPY17:%[0-9]+]]:vreg_64 = COPY [[DEF1]] + ; DAGISEL-GFX11-WF32-NEXT: FLAT_STORE_DWORDX4 [[COPY17]], killed [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`, align 32) ; DAGISEL-GFX11-WF32-NEXT: S_ENDPGM 0 ; ; DAGISEL-GFX11-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_v16i16 @@ -1273,16 +1263,14 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_v16i16(<16 x i ; DAGISEL-GFX11-WF64-NEXT: [[V_PK_ADD_U16_5:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY9]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec ; DAGISEL-GFX11-WF64-NEXT: [[V_PK_ADD_U16_6:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY10]], 8, [[COPY2]], 0, 0, 0, 0, 0, implicit $exec ; DAGISEL-GFX11-WF64-NEXT: [[V_PK_ADD_U16_7:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY11]], 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $exec - ; DAGISEL-GFX11-WF64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_7]], %subreg.sub0, [[V_PK_ADD_U16_6]], %subreg.sub1, [[V_PK_ADD_U16_5]], %subreg.sub2, [[V_PK_ADD_U16_4]], %subreg.sub3 + ; DAGISEL-GFX11-WF64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[V_PK_ADD_U16_7]], %subreg.sub0, killed [[V_PK_ADD_U16_6]], %subreg.sub1, killed [[V_PK_ADD_U16_5]], %subreg.sub2, killed [[V_PK_ADD_U16_4]], %subreg.sub3 ; DAGISEL-GFX11-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-WF64-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[DEF]] - ; DAGISEL-GFX11-WF64-NEXT: [[COPY17:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]] - ; DAGISEL-GFX11-WF64-NEXT: FLAT_STORE_DWORDX4 [[COPY16]], killed [[COPY17]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison` + 16) - ; DAGISEL-GFX11-WF64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_3]], %subreg.sub0, [[V_PK_ADD_U16_2]], %subreg.sub1, [[V_PK_ADD_U16_1]], %subreg.sub2, [[V_PK_ADD_U16_]], %subreg.sub3 + ; DAGISEL-GFX11-WF64-NEXT: FLAT_STORE_DWORDX4 [[COPY16]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison` + 16) + ; DAGISEL-GFX11-WF64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[V_PK_ADD_U16_3]], %subreg.sub0, killed [[V_PK_ADD_U16_2]], %subreg.sub1, killed [[V_PK_ADD_U16_1]], %subreg.sub2, killed [[V_PK_ADD_U16_]], %subreg.sub3 ; DAGISEL-GFX11-WF64-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; DAGISEL-GFX11-WF64-NEXT: [[COPY18:%[0-9]+]]:vreg_64 = COPY [[DEF1]] - ; DAGISEL-GFX11-WF64-NEXT: [[COPY19:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE1]] - ; DAGISEL-GFX11-WF64-NEXT: FLAT_STORE_DWORDX4 [[COPY18]], killed [[COPY19]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`, align 32) + ; DAGISEL-GFX11-WF64-NEXT: [[COPY17:%[0-9]+]]:vreg_64 = COPY [[DEF1]] + ; DAGISEL-GFX11-WF64-NEXT: FLAT_STORE_DWORDX4 [[COPY17]], killed [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`, align 32) ; DAGISEL-GFX11-WF64-NEXT: S_ENDPGM 0 ; ; DAGISEL-GFX10-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc_v16i16 @@ -1313,16 +1301,14 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_v16i16(<16 x i ; DAGISEL-GFX10-WF32-NEXT: [[V_PK_ADD_U16_5:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY9]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec ; DAGISEL-GFX10-WF32-NEXT: [[V_PK_ADD_U16_6:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY10]], 8, [[COPY2]], 0, 0, 0, 0, 0, implicit $exec ; DAGISEL-GFX10-WF32-NEXT: [[V_PK_ADD_U16_7:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY11]], 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $exec - ; DAGISEL-GFX10-WF32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_7]], %subreg.sub0, [[V_PK_ADD_U16_6]], %subreg.sub1, [[V_PK_ADD_U16_5]], %subreg.sub2, [[V_PK_ADD_U16_4]], %subreg.sub3 + ; DAGISEL-GFX10-WF32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[V_PK_ADD_U16_7]], %subreg.sub0, killed [[V_PK_ADD_U16_6]], %subreg.sub1, killed [[V_PK_ADD_U16_5]], %subreg.sub2, killed [[V_PK_ADD_U16_4]], %subreg.sub3 ; DAGISEL-GFX10-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-WF32-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[DEF]] - ; DAGISEL-GFX10-WF32-NEXT: [[COPY17:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]] - ; DAGISEL-GFX10-WF32-NEXT: FLAT_STORE_DWORDX4 [[COPY16]], killed [[COPY17]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison` + 16) - ; DAGISEL-GFX10-WF32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_3]], %subreg.sub0, [[V_PK_ADD_U16_2]], %subreg.sub1, [[V_PK_ADD_U16_1]], %subreg.sub2, [[V_PK_ADD_U16_]], %subreg.sub3 + ; DAGISEL-GFX10-WF32-NEXT: FLAT_STORE_DWORDX4 [[COPY16]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison` + 16) + ; DAGISEL-GFX10-WF32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[V_PK_ADD_U16_3]], %subreg.sub0, killed [[V_PK_ADD_U16_2]], %subreg.sub1, killed [[V_PK_ADD_U16_1]], %subreg.sub2, killed [[V_PK_ADD_U16_]], %subreg.sub3 ; DAGISEL-GFX10-WF32-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; DAGISEL-GFX10-WF32-NEXT: [[COPY18:%[0-9]+]]:vreg_64 = COPY [[DEF1]] - ; DAGISEL-GFX10-WF32-NEXT: [[COPY19:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE1]] - ; DAGISEL-GFX10-WF32-NEXT: FLAT_STORE_DWORDX4 [[COPY18]], killed [[COPY19]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`, align 32) + ; DAGISEL-GFX10-WF32-NEXT: [[COPY17:%[0-9]+]]:vreg_64 = COPY [[DEF1]] + ; DAGISEL-GFX10-WF32-NEXT: FLAT_STORE_DWORDX4 [[COPY17]], killed [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`, align 32) ; DAGISEL-GFX10-WF32-NEXT: S_ENDPGM 0 ; ; DAGISEL-GFX10-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_v16i16 @@ -1353,16 +1339,14 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_v16i16(<16 x i ; DAGISEL-GFX10-WF64-NEXT: [[V_PK_ADD_U16_5:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY9]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec ; DAGISEL-GFX10-WF64-NEXT: [[V_PK_ADD_U16_6:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY10]], 8, [[COPY2]], 0, 0, 0, 0, 0, implicit $exec ; DAGISEL-GFX10-WF64-NEXT: [[V_PK_ADD_U16_7:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY11]], 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $exec - ; DAGISEL-GFX10-WF64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_7]], %subreg.sub0, [[V_PK_ADD_U16_6]], %subreg.sub1, [[V_PK_ADD_U16_5]], %subreg.sub2, [[V_PK_ADD_U16_4]], %subreg.sub3 + ; DAGISEL-GFX10-WF64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[V_PK_ADD_U16_7]], %subreg.sub0, killed [[V_PK_ADD_U16_6]], %subreg.sub1, killed [[V_PK_ADD_U16_5]], %subreg.sub2, killed [[V_PK_ADD_U16_4]], %subreg.sub3 ; DAGISEL-GFX10-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-WF64-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[DEF]] - ; DAGISEL-GFX10-WF64-NEXT: [[COPY17:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]] - ; DAGISEL-GFX10-WF64-NEXT: FLAT_STORE_DWORDX4 [[COPY16]], killed [[COPY17]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison` + 16) - ; DAGISEL-GFX10-WF64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_3]], %subreg.sub0, [[V_PK_ADD_U16_2]], %subreg.sub1, [[V_PK_ADD_U16_1]], %subreg.sub2, [[V_PK_ADD_U16_]], %subreg.sub3 + ; DAGISEL-GFX10-WF64-NEXT: FLAT_STORE_DWORDX4 [[COPY16]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison` + 16) + ; DAGISEL-GFX10-WF64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[V_PK_ADD_U16_3]], %subreg.sub0, killed [[V_PK_ADD_U16_2]], %subreg.sub1, killed [[V_PK_ADD_U16_1]], %subreg.sub2, killed [[V_PK_ADD_U16_]], %subreg.sub3 ; DAGISEL-GFX10-WF64-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; DAGISEL-GFX10-WF64-NEXT: [[COPY18:%[0-9]+]]:vreg_64 = COPY [[DEF1]] - ; DAGISEL-GFX10-WF64-NEXT: [[COPY19:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE1]] - ; DAGISEL-GFX10-WF64-NEXT: FLAT_STORE_DWORDX4 [[COPY18]], killed [[COPY19]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`, align 32) + ; DAGISEL-GFX10-WF64-NEXT: [[COPY17:%[0-9]+]]:vreg_64 = COPY [[DEF1]] + ; DAGISEL-GFX10-WF64-NEXT: FLAT_STORE_DWORDX4 [[COPY17]], killed [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`, align 32) ; DAGISEL-GFX10-WF64-NEXT: S_ENDPGM 0 %c = add <16 x i16> %a, %b store <16 x i16> %c, ptr poison @@ -1418,8 +1402,7 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_many_regs(<36 x i ; DAGISEL-GFX11-WF32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 ; DAGISEL-GFX11-WF32-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-WF32-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[DEF1]] - ; DAGISEL-GFX11-WF32-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; DAGISEL-GFX11-WF32-NEXT: GLOBAL_STORE_DWORDX2 [[COPY5]], killed [[COPY6]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1) + ; DAGISEL-GFX11-WF32-NEXT: GLOBAL_STORE_DWORDX2 [[COPY5]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1) ; DAGISEL-GFX11-WF32-NEXT: S_ENDPGM 0 ; ; DAGISEL-GFX11-WF64-LABEL: name: amdgpu_cs_chain_preserve_many_regs @@ -1436,8 +1419,7 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_many_regs(<36 x i ; DAGISEL-GFX11-WF64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 ; DAGISEL-GFX11-WF64-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-WF64-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[DEF1]] - ; DAGISEL-GFX11-WF64-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; DAGISEL-GFX11-WF64-NEXT: GLOBAL_STORE_DWORDX2 [[COPY5]], killed [[COPY6]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1) + ; DAGISEL-GFX11-WF64-NEXT: GLOBAL_STORE_DWORDX2 [[COPY5]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1) ; DAGISEL-GFX11-WF64-NEXT: S_ENDPGM 0 ; ; DAGISEL-GFX10-WF32-LABEL: name: amdgpu_cs_chain_preserve_many_regs @@ -1454,8 +1436,7 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_many_regs(<36 x i ; DAGISEL-GFX10-WF32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 ; DAGISEL-GFX10-WF32-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-WF32-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[DEF1]] - ; DAGISEL-GFX10-WF32-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; DAGISEL-GFX10-WF32-NEXT: GLOBAL_STORE_DWORDX2 [[COPY5]], killed [[COPY6]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1) + ; DAGISEL-GFX10-WF32-NEXT: GLOBAL_STORE_DWORDX2 [[COPY5]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1) ; DAGISEL-GFX10-WF32-NEXT: S_ENDPGM 0 ; ; DAGISEL-GFX10-WF64-LABEL: name: amdgpu_cs_chain_preserve_many_regs @@ -1472,8 +1453,7 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_many_regs(<36 x i ; DAGISEL-GFX10-WF64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 ; DAGISEL-GFX10-WF64-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-WF64-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[DEF1]] - ; DAGISEL-GFX10-WF64-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; DAGISEL-GFX10-WF64-NEXT: GLOBAL_STORE_DWORDX2 [[COPY5]], killed [[COPY6]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1) + ; DAGISEL-GFX10-WF64-NEXT: GLOBAL_STORE_DWORDX2 [[COPY5]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1) ; DAGISEL-GFX10-WF64-NEXT: S_ENDPGM 0 %c = extractelement <36 x i32> %a, i32 35 store i32 %c, ptr addrspace(1) poison diff --git a/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll b/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll index 0d3f342f7735e..1caa1442fd2fd 100644 --- a/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll +++ b/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll @@ -42,13 +42,11 @@ define void @issue92561(ptr addrspace(1) %arg) { ; SDAG-NEXT: s_and_saveexec_b32 s0, s0 ; SDAG-NEXT: image_sample_c_lz v9, [v8, v8, v8, v8], s[4:11], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; SDAG-NEXT: ; implicit-def: $vgpr8 ; SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; SDAG-NEXT: s_cbranch_execnz .LBB0_1 ; SDAG-NEXT: ; %bb.2: ; SDAG-NEXT: s_mov_b32 exec_lo, s3 -; SDAG-NEXT: v_dual_mov_b32 v0, 0x7fc00000 :: v_dual_mov_b32 v1, 0 -; SDAG-NEXT: v_mov_b32_e32 v2, 1.0 +; SDAG-NEXT: v_dual_mov_b32 v0, 0x7fc00000 :: v_dual_mov_b32 v1, 1.0 ; SDAG-NEXT: s_mov_b32 s0, s12 ; SDAG-NEXT: s_mov_b32 s1, s12 ; SDAG-NEXT: s_mov_b32 s2, s12 @@ -58,19 +56,18 @@ define void @issue92561(ptr addrspace(1) %arg) { ; SDAG-NEXT: s_mov_b32 s6, s12 ; SDAG-NEXT: s_mov_b32 s7, s12 ; SDAG-NEXT: s_clause 0x2 -; SDAG-NEXT: image_sample_c_lz v0, [v1, v1, v0, v1], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY -; SDAG-NEXT: image_sample_c_lz v3, [v1, v1, v1, v1], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY -; SDAG-NEXT: image_sample_c_lz v2, [v1, v2, v1, v1], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY +; SDAG-NEXT: image_sample_c_lz v0, [v8, v8, v0, v8], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY +; SDAG-NEXT: image_sample_c_lz v2, [v8, v8, v8, v8], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY +; SDAG-NEXT: image_sample_c_lz v1, [v8, v1, v8, v8], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY ; SDAG-NEXT: s_waitcnt vmcnt(2) -; SDAG-NEXT: v_add_f32_e32 v0, v9, v0 +; SDAG-NEXT: v_dual_add_f32 v0, v9, v0 :: v_dual_mov_b32 v9, v8 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-NEXT: v_add_f32_e32 v0, v1, v0 ; SDAG-NEXT: v_add_f32_e32 v0, v2, v0 -; SDAG-NEXT: v_mov_b32_e32 v2, v1 -; SDAG-NEXT: v_add_f32_e32 v0, v3, v0 ; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; SDAG-NEXT: v_mul_f32_e32 v0, 0x3e800000, v0 -; SDAG-NEXT: image_store v[0:2], [v1, v1], s[0:7] dim:SQ_RSRC_IMG_2D unorm +; SDAG-NEXT: v_dual_mul_f32 v7, 0x3e800000, v0 :: v_dual_mov_b32 v0, 0 +; SDAG-NEXT: image_store v[7:9], [v0, v0], s[0:7] dim:SQ_RSRC_IMG_2D unorm ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: issue92561: diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f16.ll index d7037f17bc626..a6560f1d414f3 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f16.ll @@ -160,7 +160,6 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -184,7 +183,7 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -212,7 +211,6 @@ define void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -236,7 +234,7 @@ define void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -506,7 +504,6 @@ define void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], killed [[S_MOV_B32_]], 0, implicit $exec - ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -530,7 +527,7 @@ define void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[COPY8]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f32.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f32.ll index 2daa8265db249..635fd1508c6b2 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f32.ll @@ -112,7 +112,6 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -136,7 +135,7 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -164,7 +163,6 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2 - ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE1]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -188,7 +186,7 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[COPY9]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[REG_SEQUENCE1]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -217,7 +215,6 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE1]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -241,7 +238,7 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[COPY10]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -270,7 +267,6 @@ define void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE1]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -294,7 +290,7 @@ define void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[COPY10]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -321,7 +317,6 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 - ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -342,7 +337,7 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY7]], [[COPY]], killed [[REG_SEQUENCE4]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY]], killed [[REG_SEQUENCE4]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -369,7 +364,6 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -390,7 +384,7 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY7]], [[COPY]], killed [[REG_SEQUENCE4]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY]], killed [[REG_SEQUENCE4]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -417,7 +411,6 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -441,7 +434,7 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 16, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 16, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -469,7 +462,6 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -493,7 +485,7 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -523,7 +515,6 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], killed [[S_MOV_B32_]], 0, implicit $exec - ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -547,7 +538,7 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY8]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -581,7 +572,6 @@ define void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], killed [[S_MOV_B32_]], 0, implicit $exec - ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE1]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -605,7 +595,7 @@ define void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[COPY10]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.ll index 78cb38aff2f3e..282b816b05941 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.ll @@ -598,7 +598,6 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -622,7 +621,7 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -650,7 +649,6 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2 - ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE1]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -674,7 +672,7 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_STORE_DWORDX3_OFFEN_exact [[COPY9]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_STORE_DWORDX3_OFFEN_exact [[REG_SEQUENCE1]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -703,7 +701,6 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE1]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -727,7 +724,7 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact [[COPY10]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact [[REG_SEQUENCE1]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -949,7 +946,6 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -973,7 +969,7 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -1000,7 +996,6 @@ define void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -1024,7 +1019,7 @@ define void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.f16.ll index f5fd05983c9af..dcfd0ec455eee 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.f16.ll @@ -20,7 +20,7 @@ define half @raw_ptr_buffer_load_format_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -74,7 +74,7 @@ define <2 x half> @raw_ptr_buffer_load_format_v2f16__sgpr_rsrc__vgpr_voffset__sg ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -128,7 +128,7 @@ define <4 x half> @raw_ptr_buffer_load_format_v4f16__sgpr_rsrc__vgpr_voffset__sg ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -186,7 +186,7 @@ define half @raw_ptr_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -240,7 +240,7 @@ define <4 x half> @raw_ptr_buffer_load_format_v4f16__sgpr_rsrc__vgpr_voffset__sg ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.ll index 36cb9620b13c5..41cc21215fc19 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.ll @@ -20,7 +20,7 @@ define float @raw_ptr_buffer_load_format_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -74,7 +74,7 @@ define <2 x float> @raw_ptr_buffer_load_format_v2f32__sgpr_rsrc__vgpr_voffset__s ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -131,7 +131,7 @@ define <3 x float> @raw_ptr_buffer_load_format_v3f32__sgpr_rsrc__vgpr_voffset__s ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -190,7 +190,7 @@ define <4 x float> @raw_ptr_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__s ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -252,7 +252,7 @@ define float @raw_ptr_buffer_load_format_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -306,7 +306,7 @@ define <4 x float> @raw_ptr_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__s ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.ll index 8c9217554ff76..2556c67d7235e 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.ll @@ -20,7 +20,7 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -75,7 +75,7 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__sgpr_voffset__sgpr_soffset(ptr ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -130,7 +130,7 @@ define float @raw_ptr_buffer_load_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -185,7 +185,7 @@ define float @raw_ptr_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffset(ptr ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -240,7 +240,7 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -295,7 +295,7 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -350,7 +350,7 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -405,7 +405,7 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -460,7 +460,7 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -515,7 +515,7 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -570,7 +570,7 @@ define <2 x float> @raw_ptr_buffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_sof ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -627,7 +627,7 @@ define <3 x float> @raw_ptr_buffer_load_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_sof ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -686,7 +686,7 @@ define <4 x float> @raw_ptr_buffer_load_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_sof ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -747,7 +747,7 @@ define half @raw_ptr_buffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -802,7 +802,7 @@ define <2 x half> @raw_ptr_buffer_load_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -856,7 +856,7 @@ define <4 x half> @raw_ptr_buffer_load_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -913,7 +913,7 @@ define float @raw_ptr_buffer_load_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset_zext ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -969,7 +969,7 @@ define float @raw_ptr_buffer_load_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sext ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -1025,7 +1025,7 @@ define float @raw_ptr_buffer_load_i16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_zex ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -1081,7 +1081,7 @@ define float @raw_ptr_buffer_load_i16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sex ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -1138,7 +1138,7 @@ define half @raw_ptr_buffer_load_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -1194,7 +1194,7 @@ define float @raw_ptr_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -1249,7 +1249,7 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vdpr_voffset__sgpr_soffset__vo ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY5]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY6]], %subreg.sub2, killed [[COPY5]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -1302,7 +1302,7 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__vo ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY5]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY6]], %subreg.sub2, killed [[COPY5]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -1355,7 +1355,7 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__vo ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY5]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY6]], %subreg.sub2, killed [[COPY5]], %subreg.sub3 ; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} @@ -1410,7 +1410,7 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_vof ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -1465,7 +1465,7 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__vo ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -1520,7 +1520,7 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__vo ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], killed [[S_MOV_B32_]], 0, implicit $exec ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec @@ -1576,7 +1576,7 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sof ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY5]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY6]], %subreg.sub2, killed [[COPY5]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} @@ -1627,7 +1627,7 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sof ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY5]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY6]], %subreg.sub2, killed [[COPY5]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} @@ -1679,7 +1679,7 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sof ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 ; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], killed [[S_MOV_B32_]], 0, implicit $exec ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec @@ -1736,7 +1736,7 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sof ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 ; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], killed [[S_MOV_B32_]], 0, implicit $exec ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec @@ -1793,7 +1793,7 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sof ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], killed [[S_MOV_B32_]], 0, implicit $exec ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec @@ -1851,7 +1851,7 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sof ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000 ; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], killed [[S_MOV_B32_]], 0, implicit $exec ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec @@ -1909,7 +1909,7 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_vof ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], killed [[S_MOV_B32_]], 0, implicit $exec ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f16.ll index 9c0247a31cf67..b111b5b4180fd 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f16.ll @@ -20,7 +20,7 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -73,7 +73,7 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_4095__sgp ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -127,7 +127,7 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -182,9 +182,8 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY9]], %subreg.sub2, [[COPY8]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX908-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -208,7 +207,7 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE3]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -240,9 +239,8 @@ define void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY9]], %subreg.sub2, [[COPY8]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX908-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -266,7 +264,7 @@ define void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE3]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -295,7 +293,7 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} @@ -346,7 +344,7 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} @@ -398,7 +396,7 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -453,7 +451,7 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -508,7 +506,7 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], killed [[S_MOV_B32_]], 0, implicit $exec ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec @@ -568,11 +566,10 @@ define void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY9]], %subreg.sub2, [[COPY8]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], killed [[S_MOV_B32_]], 0, implicit $exec - ; GFX908-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -596,7 +593,7 @@ define void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[COPY12]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE3]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f32.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f32.ll index 316089993e793..dac7c7a60b9b4 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f32.ll @@ -20,7 +20,7 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -73,7 +73,7 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_4095__sgp ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -128,9 +128,8 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY9]], %subreg.sub2, [[COPY8]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX908-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -154,7 +153,7 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE3]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -186,9 +185,8 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY12]], %subreg.sub0, killed [[COPY11]], %subreg.sub1, killed [[COPY10]], %subreg.sub2, killed [[COPY9]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2 - ; GFX908-NEXT: [[COPY13:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE3]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -212,7 +210,7 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[COPY13]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s96) into %ir.rsrc, align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[REG_SEQUENCE3]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s96) into %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -245,9 +243,8 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; GFX908-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY12]], %subreg.sub1, [[COPY11]], %subreg.sub2, [[COPY10]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY13]], %subreg.sub0, killed [[COPY12]], %subreg.sub1, killed [[COPY11]], %subreg.sub2, killed [[COPY10]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX908-NEXT: [[COPY14:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE3]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -271,7 +268,7 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[COPY14]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.rsrc, align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE3]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -304,9 +301,8 @@ define void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; GFX908-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY12]], %subreg.sub1, [[COPY11]], %subreg.sub2, [[COPY10]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY13]], %subreg.sub0, killed [[COPY12]], %subreg.sub1, killed [[COPY11]], %subreg.sub2, killed [[COPY10]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX908-NEXT: [[COPY14:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE3]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -330,7 +326,7 @@ define void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[COPY14]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.rsrc, align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE3]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -360,10 +356,9 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 - ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -384,7 +379,7 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY11]], [[COPY]], killed [[REG_SEQUENCE6]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE3]], [[COPY]], killed [[REG_SEQUENCE6]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -414,10 +409,9 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -438,7 +432,7 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY11]], [[COPY]], killed [[REG_SEQUENCE6]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE3]], [[COPY]], killed [[REG_SEQUENCE6]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -469,9 +463,8 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY9]], %subreg.sub2, [[COPY8]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX908-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -495,7 +488,7 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 16, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE3]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 16, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -527,9 +520,8 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY9]], %subreg.sub2, [[COPY8]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX908-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -553,7 +545,7 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE3]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -585,11 +577,10 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY9]], %subreg.sub2, [[COPY8]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], killed [[S_MOV_B32_]], 0, implicit $exec - ; GFX908-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -613,7 +604,7 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY12]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE3]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -649,11 +640,10 @@ define void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; GFX908-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY12]], %subreg.sub1, [[COPY11]], %subreg.sub2, [[COPY10]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY13]], %subreg.sub0, killed [[COPY12]], %subreg.sub1, killed [[COPY11]], %subreg.sub2, killed [[COPY10]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], killed [[S_MOV_B32_]], 0, implicit $exec - ; GFX908-NEXT: [[COPY14:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE3]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -677,7 +667,7 @@ define void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[COPY14]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.rsrc, align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE3]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.ll index e8d3d34e6df63..266d7a38723c7 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.ll @@ -21,7 +21,7 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -76,7 +76,7 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__sgpr_val__sgpr_voffset__sgpr_soffs ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -131,7 +131,7 @@ define void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -186,7 +186,7 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr_soffs ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -241,7 +241,7 @@ define void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr_soffs ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -295,7 +295,7 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -349,7 +349,7 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -403,7 +403,7 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -457,7 +457,7 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -511,7 +511,7 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -565,7 +565,7 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -619,7 +619,7 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -674,9 +674,8 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY9]], %subreg.sub2, [[COPY8]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX908-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -700,7 +699,7 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE3]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -732,9 +731,8 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY12]], %subreg.sub0, killed [[COPY11]], %subreg.sub1, killed [[COPY10]], %subreg.sub2, killed [[COPY9]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2 - ; GFX908-NEXT: [[COPY13:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE3]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -758,7 +756,7 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_STORE_DWORDX3_OFFEN_exact [[COPY13]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s96) into %ir.rsrc, align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_STORE_DWORDX3_OFFEN_exact [[REG_SEQUENCE3]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s96) into %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -791,9 +789,8 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; GFX908-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY12]], %subreg.sub1, [[COPY11]], %subreg.sub2, [[COPY10]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY13]], %subreg.sub0, killed [[COPY12]], %subreg.sub1, killed [[COPY11]], %subreg.sub2, killed [[COPY10]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX908-NEXT: [[COPY14:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE3]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -817,7 +814,7 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact [[COPY14]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.rsrc, align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact [[REG_SEQUENCE3]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -847,7 +844,7 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -902,7 +899,7 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -957,7 +954,7 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} @@ -1012,7 +1009,7 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -1067,9 +1064,8 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY9]], %subreg.sub2, [[COPY8]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX908-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -1093,7 +1089,7 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE3]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -1124,9 +1120,8 @@ define void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY9]], %subreg.sub2, [[COPY8]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX908-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -1150,7 +1145,7 @@ define void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE3]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -1179,7 +1174,7 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_voffset ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -1232,7 +1227,7 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_voffset ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} @@ -1287,7 +1282,7 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -1342,7 +1337,7 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -1397,7 +1392,7 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], killed [[S_MOV_B32_]], 0, implicit $exec ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec @@ -1453,7 +1448,7 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} @@ -1504,7 +1499,7 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} @@ -1556,7 +1551,7 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -1611,7 +1606,7 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -1666,7 +1661,7 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], killed [[S_MOV_B32_]], 0, implicit $exec ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec @@ -1724,7 +1719,7 @@ define void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], killed [[S_MOV_B32_]], 0, implicit $exec ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec @@ -1781,7 +1776,7 @@ define void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr_soffs ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.f16.ll index f35d1f1d8abd8..488ed243d31fe 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.f16.ll @@ -19,7 +19,7 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr add ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -73,7 +73,7 @@ define <2 x half> @raw_tbuffer_load_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -127,7 +127,7 @@ define <4 x half> @raw_tbuffer_load_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -184,7 +184,7 @@ define half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffset(ptr add ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -238,7 +238,7 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(ptr ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -292,7 +292,7 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(ptr ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -346,7 +346,7 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_glc ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -400,7 +400,7 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(ptr ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.ll index da07f158d16fc..ad25ae604368d 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.ll @@ -19,7 +19,7 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr ad ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -73,7 +73,7 @@ define <2 x float> @raw_tbuffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffse ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -130,7 +130,7 @@ define <3 x float> @raw_tbuffer_load_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffse ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -189,7 +189,7 @@ define <4 x float> @raw_tbuffer_load_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffse ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -250,7 +250,7 @@ define float @raw_tbuffer_load_f32__vgpr_rsrc__sgpr_voffset__vgpr_soffset(ptr ad ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -304,7 +304,7 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(pt ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -358,7 +358,7 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(pt ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -412,7 +412,7 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_gl ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -466,7 +466,7 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(pt ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.f16.ll index 7441461ce57c0..8718299e5a2d3 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.f16.ll @@ -20,7 +20,7 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(half % ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -74,7 +74,7 @@ define void @raw_tbuffer_store_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -129,9 +129,8 @@ define void @raw_tbuffer_store_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY9]], %subreg.sub2, [[COPY8]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; GFX908-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -155,7 +154,7 @@ define void @raw_tbuffer_store_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: TBUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) + ; GFX908-NEXT: TBUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE3]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -186,7 +185,7 @@ define void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffset(half % ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -241,7 +240,7 @@ define void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soffset(half % ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -296,7 +295,7 @@ define void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffset(half % ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -350,7 +349,7 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(ha ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -404,7 +403,7 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(ha ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -458,7 +457,7 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_gl ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -512,7 +511,7 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(ha ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.ll index 59207c95961b9..e9bdf468182fd 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.ll @@ -21,7 +21,7 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -77,9 +77,8 @@ define void @raw_tbuffer_store_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY9]], %subreg.sub2, [[COPY8]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; GFX908-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -103,7 +102,7 @@ define void @raw_tbuffer_store_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) + ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE3]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -136,9 +135,8 @@ define void @raw_tbuffer_store_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<3 x ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY12]], %subreg.sub0, killed [[COPY11]], %subreg.sub1, killed [[COPY10]], %subreg.sub2, killed [[COPY9]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1, [[COPY6]], %subreg.sub2 - ; GFX908-NEXT: [[COPY13:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE3]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -162,7 +160,7 @@ define void @raw_tbuffer_store_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<3 x ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[COPY13]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s96) into %ir.rsrc, align 1, addrspace 8) + ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[REG_SEQUENCE3]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s96) into %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -196,9 +194,8 @@ define void @raw_tbuffer_store_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY12]], %subreg.sub1, [[COPY11]], %subreg.sub2, [[COPY10]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY13]], %subreg.sub0, killed [[COPY12]], %subreg.sub1, killed [[COPY11]], %subreg.sub2, killed [[COPY10]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 - ; GFX908-NEXT: [[COPY14:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE3]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -222,7 +219,7 @@ define void @raw_tbuffer_store_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[COPY14]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.rsrc, align 1, addrspace 8) + ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE3]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -253,7 +250,7 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__sgpr_voffset__sgpr_soffset(float ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -308,7 +305,7 @@ define void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffset(float ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -363,7 +360,7 @@ define void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffset(float ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -418,7 +415,7 @@ define void @raw_tbuffer_store_f32__vgpr_rsrc__sgpr_voffset__vgpr_soffset(float ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -473,7 +470,7 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(fl ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -528,7 +525,7 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(fl ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -583,7 +580,7 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_gl ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -638,7 +635,7 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(fl ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -691,7 +688,7 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vdpr_voffset__sgpr_soffset__voffs ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -744,7 +741,7 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffs ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -797,7 +794,7 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffs ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} @@ -852,7 +849,7 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffse ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -907,7 +904,7 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffs ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -962,7 +959,7 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffs ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], killed [[S_MOV_B32_]], 0, implicit $exec ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec @@ -1018,7 +1015,7 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} @@ -1069,7 +1066,7 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} @@ -1121,7 +1118,7 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 ; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], killed [[S_MOV_B32_]], 0, implicit $exec ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec @@ -1178,7 +1175,7 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 ; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], killed [[S_MOV_B32_]], 0, implicit $exec ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec @@ -1235,7 +1232,7 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], killed [[S_MOV_B32_]], 0, implicit $exec ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec @@ -1293,7 +1290,7 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000 ; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], killed [[S_MOV_B32_]], 0, implicit $exec ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec @@ -1351,7 +1348,7 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffse ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], killed [[S_MOV_B32_]], 0, implicit $exec ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.f16.ll index 78241c4806ae4..18c2be3678718 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.f16.ll @@ -113,7 +113,6 @@ define void @raw_tbuffer_store_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -137,7 +136,7 @@ define void @raw_tbuffer_store_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: TBUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) + ; GFX908-NEXT: TBUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.ll index a11bc33976629..153ec58bc1b1b 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.ll @@ -67,7 +67,6 @@ define void @raw_tbuffer_store_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -91,7 +90,7 @@ define void @raw_tbuffer_store_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) + ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -120,7 +119,6 @@ define void @raw_tbuffer_store_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<3 x ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1, [[COPY6]], %subreg.sub2 - ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE1]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -144,7 +142,7 @@ define void @raw_tbuffer_store_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<3 x ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[COPY9]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 8) + ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[REG_SEQUENCE1]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -174,7 +172,6 @@ define void @raw_tbuffer_store_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3 - ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE1]] ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -198,7 +195,7 @@ define void @raw_tbuffer_store_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[COPY10]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/legalize-soffset-mbuf.ll b/llvm/test/CodeGen/AMDGPU/legalize-soffset-mbuf.ll index 43836866592d8..8c223a98784f1 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-soffset-mbuf.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-soffset-mbuf.ll @@ -374,7 +374,6 @@ define void @llvm_amdgcn_raw_buffer_store_v2f32(<2 x float> %val, i32 %voffset, ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX908-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} @@ -388,7 +387,7 @@ define void @llvm_amdgcn_raw_buffer_store_v2f32(<2 x float> %val, i32 %voffset, ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[COPY4]], [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE]], [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -410,7 +409,6 @@ define void @llvm_amdgcn_raw_tbuffer_store_v2f32(<2 x float> %val, i32 %voffset, ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX908-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} @@ -424,7 +422,7 @@ define void @llvm_amdgcn_raw_tbuffer_store_v2f32(<2 x float> %val, i32 %voffset, ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY4]], [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) + ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE]], [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -447,7 +445,6 @@ define void @llvm_amdgcn_raw_buffer_store_v3f32(<3 x float> %val, i32 %voffset, ; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2 - ; GFX908-NEXT: [[COPY5:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE]] ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} @@ -461,7 +458,7 @@ define void @llvm_amdgcn_raw_buffer_store_v3f32(<3 x float> %val, i32 %voffset, ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_STORE_DWORDX3_OFFEN_exact [[COPY5]], [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_STORE_DWORDX3_OFFEN_exact [[REG_SEQUENCE]], [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -484,7 +481,6 @@ define void @llvm_amdgcn_raw_tbuffer_store_v3f32(<3 x float> %val, i32 %voffset, ; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2 - ; GFX908-NEXT: [[COPY5:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE]] ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} @@ -498,7 +494,7 @@ define void @llvm_amdgcn_raw_tbuffer_store_v3f32(<3 x float> %val, i32 %voffset, ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[COPY5]], [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 8) + ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[REG_SEQUENCE]], [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -522,7 +518,6 @@ define void @llvm_amdgcn_raw_buffer_store_v4f32(<4 x float> %val, i32 %voffset, ; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]] ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} @@ -536,7 +531,7 @@ define void @llvm_amdgcn_raw_buffer_store_v4f32(<4 x float> %val, i32 %voffset, ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact [[COPY6]], [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact [[REG_SEQUENCE]], [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -560,7 +555,6 @@ define void @llvm_amdgcn_raw_tbuffer_store_v4f32(<4 x float> %val, i32 %voffset, ; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]] ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} @@ -574,7 +568,7 @@ define void @llvm_amdgcn_raw_tbuffer_store_v4f32(<4 x float> %val, i32 %voffset, ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[COPY6]], [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE]], [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -958,7 +952,6 @@ define void @llvm_amdgcn_raw_ptr_buffer_store_v2f32(<2 x float> %val, i32 %voffs ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX908-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} @@ -972,7 +965,7 @@ define void @llvm_amdgcn_raw_ptr_buffer_store_v2f32(<2 x float> %val, i32 %voffs ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[COPY4]], [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into `ptr addrspace(8) poison`, align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE]], [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into `ptr addrspace(8) poison`, align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -994,7 +987,6 @@ define void @llvm_amdgcn_raw_ptr_tbuffer_store_v2f32(<2 x float> %val, i32 %voff ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX908-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} @@ -1008,7 +1000,7 @@ define void @llvm_amdgcn_raw_ptr_tbuffer_store_v2f32(<2 x float> %val, i32 %voff ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY4]], [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into `ptr addrspace(8) poison`, align 1, addrspace 8) + ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE]], [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into `ptr addrspace(8) poison`, align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -1031,7 +1023,6 @@ define void @llvm_amdgcn_raw_ptr_buffer_store_v3f32(<3 x float> %val, i32 %voffs ; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2 - ; GFX908-NEXT: [[COPY5:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE]] ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} @@ -1045,7 +1036,7 @@ define void @llvm_amdgcn_raw_ptr_buffer_store_v3f32(<3 x float> %val, i32 %voffs ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_STORE_DWORDX3_OFFEN_exact [[COPY5]], [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s96) into `ptr addrspace(8) poison`, align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_STORE_DWORDX3_OFFEN_exact [[REG_SEQUENCE]], [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s96) into `ptr addrspace(8) poison`, align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -1068,7 +1059,6 @@ define void @llvm_amdgcn_raw_ptr_tbuffer_store_v3f32(<3 x float> %val, i32 %voff ; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2 - ; GFX908-NEXT: [[COPY5:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE]] ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} @@ -1082,7 +1072,7 @@ define void @llvm_amdgcn_raw_ptr_tbuffer_store_v3f32(<3 x float> %val, i32 %voff ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[COPY5]], [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s96) into `ptr addrspace(8) poison`, align 1, addrspace 8) + ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[REG_SEQUENCE]], [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s96) into `ptr addrspace(8) poison`, align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -1106,7 +1096,6 @@ define void @llvm_amdgcn_raw_ptr_buffer_store_v4f32(<4 x float> %val, i32 %voffs ; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]] ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} @@ -1120,7 +1109,7 @@ define void @llvm_amdgcn_raw_ptr_buffer_store_v4f32(<4 x float> %val, i32 %voffs ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact [[COPY6]], [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into `ptr addrspace(8) poison`, align 1, addrspace 8) + ; GFX908-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact [[REG_SEQUENCE]], [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into `ptr addrspace(8) poison`, align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} @@ -1144,7 +1133,6 @@ define void @llvm_amdgcn_raw_ptr_tbuffer_store_v4f32(<4 x float> %val, i32 %voff ; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]] ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} @@ -1158,7 +1146,7 @@ define void @llvm_amdgcn_raw_ptr_tbuffer_store_v4f32(<4 x float> %val, i32 %voff ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[COPY6]], [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into `ptr addrspace(8) poison`, align 1, addrspace 8) + ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE]], [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into `ptr addrspace(8) poison`, align 1, addrspace 8) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll index 1e6aea593065c..750f72e775a95 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll @@ -159,100 +159,100 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias ; GCN-NEXT: v_mov_b32_e32 v1, 2.0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_u32_e32 v3, s0, v0 -; GCN-NEXT: ds_read_b128 a[156:159], v3 offset:112 -; GCN-NEXT: ds_read_b128 a[152:155], v3 offset:96 -; GCN-NEXT: ds_read_b128 a[148:151], v3 offset:80 -; GCN-NEXT: ds_read_b128 a[144:147], v3 offset:64 -; GCN-NEXT: ds_read_b128 a[128:131], v3 -; GCN-NEXT: ds_read_b128 a[132:135], v3 offset:16 -; GCN-NEXT: ds_read_b128 a[136:139], v3 offset:32 -; GCN-NEXT: ds_read_b128 a[140:143], v3 offset:48 +; GCN-NEXT: ds_read_b128 a[28:31], v3 offset:112 +; GCN-NEXT: ds_read_b128 a[24:27], v3 offset:96 +; GCN-NEXT: ds_read_b128 a[20:23], v3 offset:80 +; GCN-NEXT: ds_read_b128 a[16:19], v3 offset:64 +; GCN-NEXT: ds_read_b128 a[0:3], v3 +; GCN-NEXT: ds_read_b128 a[4:7], v3 offset:16 +; GCN-NEXT: ds_read_b128 a[8:11], v3 offset:32 +; GCN-NEXT: ds_read_b128 a[12:15], v3 offset:48 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159] -; GCN-NEXT: ds_read_b128 a[124:127], v3 offset:8304 -; GCN-NEXT: ds_read_b128 a[120:123], v3 offset:8288 -; GCN-NEXT: ds_read_b128 a[116:119], v3 offset:8272 -; GCN-NEXT: ds_read_b128 a[112:115], v3 offset:8256 -; GCN-NEXT: ds_read_b128 a[108:111], v3 offset:8240 -; GCN-NEXT: ds_read_b128 a[104:107], v3 offset:8224 -; GCN-NEXT: ds_read_b128 a[100:103], v3 offset:8208 -; GCN-NEXT: ds_read_b128 a[96:99], v3 offset:8192 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] +; GCN-NEXT: ds_read_b128 a[156:159], v3 offset:8304 +; GCN-NEXT: ds_read_b128 a[152:155], v3 offset:8288 +; GCN-NEXT: ds_read_b128 a[148:151], v3 offset:8272 +; GCN-NEXT: ds_read_b128 a[144:147], v3 offset:8256 +; GCN-NEXT: ds_read_b128 a[140:143], v3 offset:8240 +; GCN-NEXT: ds_read_b128 a[136:139], v3 offset:8224 +; GCN-NEXT: ds_read_b128 a[132:135], v3 offset:8208 +; GCN-NEXT: ds_read_b128 a[128:131], v3 offset:8192 ; GCN-NEXT: v_add_u32_e32 v4, 0x6000, v3 ; GCN-NEXT: v_add_u32_e32 v0, s1, v0 ; GCN-NEXT: ; iglp_opt mask(0x00000001) ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159] +; GCN-NEXT: ds_read_b128 a[124:127], v3 offset:24688 +; GCN-NEXT: ds_read_b128 a[120:123], v3 offset:24672 +; GCN-NEXT: ds_read_b128 a[116:119], v3 offset:24656 +; GCN-NEXT: ds_read_b128 a[112:115], v3 offset:24640 +; GCN-NEXT: ds_read_b128 a[108:111], v3 offset:24624 +; GCN-NEXT: ds_read_b128 a[104:107], v3 offset:24608 +; GCN-NEXT: ds_read_b128 a[100:103], v3 offset:24592 +; GCN-NEXT: ds_read_b128 a[96:99], v3 offset:24576 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v1, a[96:127] -; GCN-NEXT: ds_read_b128 a[92:95], v3 offset:24688 -; GCN-NEXT: ds_read_b128 a[88:91], v3 offset:24672 -; GCN-NEXT: ds_read_b128 a[84:87], v3 offset:24656 -; GCN-NEXT: ds_read_b128 a[80:83], v3 offset:24640 -; GCN-NEXT: ds_read_b128 a[76:79], v3 offset:24624 -; GCN-NEXT: ds_read_b128 a[72:75], v3 offset:24608 -; GCN-NEXT: ds_read_b128 a[68:71], v3 offset:24592 -; GCN-NEXT: ds_read_b128 a[64:67], v3 offset:24576 +; GCN-NEXT: ds_read_b128 a[92:95], v3 offset:49264 +; GCN-NEXT: ds_read_b128 a[88:91], v3 offset:49248 +; GCN-NEXT: ds_read_b128 a[84:87], v3 offset:49232 +; GCN-NEXT: ds_read_b128 a[80:83], v3 offset:49216 +; GCN-NEXT: ds_read_b128 a[76:79], v3 offset:49200 +; GCN-NEXT: ds_read_b128 a[72:75], v3 offset:49184 +; GCN-NEXT: ds_read_b128 a[68:71], v3 offset:49168 +; GCN-NEXT: ds_read_b128 a[64:67], v3 offset:49152 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v1, a[64:95] -; GCN-NEXT: ds_read_b128 a[60:63], v3 offset:49264 -; GCN-NEXT: ds_read_b128 a[56:59], v3 offset:49248 -; GCN-NEXT: ds_read_b128 a[52:55], v3 offset:49232 -; GCN-NEXT: ds_read_b128 a[48:51], v3 offset:49216 -; GCN-NEXT: ds_read_b128 a[44:47], v3 offset:49200 -; GCN-NEXT: ds_read_b128 a[40:43], v3 offset:49184 -; GCN-NEXT: ds_read_b128 a[36:39], v3 offset:49168 -; GCN-NEXT: ds_read_b128 a[32:35], v3 offset:49152 +; GCN-NEXT: ds_read_b128 a[60:63], v4 offset:57456 +; GCN-NEXT: ds_read_b128 a[56:59], v4 offset:57440 +; GCN-NEXT: ds_read_b128 a[52:55], v4 offset:57424 +; GCN-NEXT: ds_read_b128 a[48:51], v4 offset:57408 +; GCN-NEXT: ds_read_b128 a[32:35], v4 offset:57344 +; GCN-NEXT: ds_read_b128 a[36:39], v4 offset:57360 +; GCN-NEXT: ds_read_b128 a[40:43], v4 offset:57376 +; GCN-NEXT: ds_read_b128 a[44:47], v4 offset:57392 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63] -; GCN-NEXT: ds_read_b128 a[28:31], v4 offset:57456 -; GCN-NEXT: ds_read_b128 a[24:27], v4 offset:57440 -; GCN-NEXT: ds_read_b128 a[20:23], v4 offset:57424 -; GCN-NEXT: ds_read_b128 a[16:19], v4 offset:57408 -; GCN-NEXT: ds_read_b128 a[0:3], v4 offset:57344 -; GCN-NEXT: ds_read_b128 a[4:7], v4 offset:57360 -; GCN-NEXT: ds_read_b128 a[8:11], v4 offset:57376 -; GCN-NEXT: ds_read_b128 a[12:15], v4 offset:57392 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] -; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:112 -; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:96 -; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:80 -; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:64 -; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:48 -; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:32 -; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:16 -; GCN-NEXT: ds_write_b128 v0, a[128:131] +; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112 +; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96 +; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:80 +; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:64 +; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:48 +; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32 +; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16 +; GCN-NEXT: ds_write_b128 v0, a[0:3] ; GCN-NEXT: v_mov_b32_e32 v0, s1 -; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:8288 -; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:8304 -; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:8256 -; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:8272 -; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:8224 -; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:8240 -; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:8192 -; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:8208 -; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:16480 -; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:16496 -; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:16448 -; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:16464 -; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:16416 -; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:16432 -; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:16384 -; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:16400 -; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:24672 -; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:24688 -; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:24640 -; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:24656 -; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:24608 -; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:24624 -; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:24576 -; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:24592 -; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:32864 -; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:32880 -; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:32832 -; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:32848 -; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32800 -; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:32816 -; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:32768 -; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:32784 +; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:8288 +; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:8304 +; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:8256 +; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:8272 +; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:8224 +; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:8240 +; GCN-NEXT: ds_write_b128 v0, a[128:131] offset:8192 +; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:8208 +; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:16480 +; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:16496 +; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:16448 +; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:16464 +; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:16416 +; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:16432 +; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:16384 +; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:16400 +; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:24672 +; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:24688 +; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:24640 +; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:24656 +; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:24608 +; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:24624 +; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:24576 +; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:24592 +; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:32864 +; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:32880 +; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:32832 +; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:32848 +; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:32800 +; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:32816 +; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:32768 +; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:32784 ; GCN-NEXT: s_endpgm entry: call void @llvm.amdgcn.iglp.opt(i32 1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll index 83f0229aea326..3712b4782dede 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll @@ -39,9 +39,7 @@ define amdgpu_ps void @atomic_cmpswap_1d_agpr(<8 x i32> inreg %rsrc, i32 %s) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a1 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: image_atomic_cmpswap v[2:3], v0, s[0:7] dmask:0x3 unorm glc +; GFX90A-NEXT: image_atomic_cmpswap a[0:1], v0, s[0:7] dmask:0x3 unorm glc ; GFX90A-NEXT: s_endpgm %cmp = call i32 asm "; def $0", "=a"() %swap = call i32 asm "; def $0", "=a"() @@ -70,14 +68,10 @@ define amdgpu_ps void @atomic_cmpswap_1d_64_agpr(<8 x i32> inreg %rsrc, i32 %s) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ; def a[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX90A-NEXT: image_atomic_cmpswap v[2:5], v0, s[0:7] dmask:0xf unorm glc +; GFX90A-NEXT: image_atomic_cmpswap a[0:3], v0, s[0:7] dmask:0xf unorm glc ; GFX90A-NEXT: s_endpgm %cmp = call i64 asm "; def $0", "=a"() %swap = call i64 asm "; def $0", "=a"() diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll index 5b6fc6ae2cb91..7a03a31acc634 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll @@ -310,11 +310,11 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX1013: ; %bb.0: ; %main_body ; GFX1013-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX1013-NEXT: v_mov_b32_e32 v6, 4.0 -; GFX1013-NEXT: v_mov_b32_e32 v7, 0x40a00000 -; GFX1013-NEXT: v_mov_b32_e32 v8, 0x40c00000 -; GFX1013-NEXT: v_mov_b32_e32 v9, 0x40e00000 ; GFX1013-NEXT: v_mov_b32_e32 v10, 0x41000000 +; GFX1013-NEXT: v_mov_b32_e32 v9, 0x40e00000 +; GFX1013-NEXT: v_mov_b32_e32 v8, 0x40c00000 +; GFX1013-NEXT: v_mov_b32_e32 v7, 0x40a00000 +; GFX1013-NEXT: v_mov_b32_e32 v6, 4.0 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) ; GFX1013-NEXT: v_add_co_u32 v2, s0, s8, v0 ; GFX1013-NEXT: v_add_co_ci_u32_e64 v3, s0, s9, 0, s0 @@ -323,9 +323,9 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX1013-NEXT: flat_load_dword v0, v[2:3] ; GFX1013-NEXT: flat_load_dword v1, v[4:5] ; GFX1013-NEXT: v_mov_b32_e32 v2, 0 -; GFX1013-NEXT: v_mov_b32_e32 v3, 1.0 -; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0 ; GFX1013-NEXT: v_mov_b32_e32 v5, 0x40400000 +; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0 +; GFX1013-NEXT: v_mov_b32_e32 v3, 1.0 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[12:15] ; GFX1013-NEXT: s_waitcnt vmcnt(0) @@ -474,8 +474,8 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; GFX1013: ; %bb.0: ; %main_body ; GFX1013-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX1013-NEXT: v_mov_b32_e32 v6, 0x46004500 ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x48004700 +; GFX1013-NEXT: v_mov_b32_e32 v6, 0x46004500 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) ; GFX1013-NEXT: v_add_co_u32 v2, s0, s8, v0 ; GFX1013-NEXT: v_add_co_ci_u32_e64 v3, s0, s9, 0, s0 @@ -484,9 +484,9 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; GFX1013-NEXT: flat_load_dword v0, v[2:3] ; GFX1013-NEXT: flat_load_dword v1, v[4:5] ; GFX1013-NEXT: v_mov_b32_e32 v2, 0 -; GFX1013-NEXT: v_mov_b32_e32 v3, 1.0 -; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0 ; GFX1013-NEXT: v_mov_b32_e32 v5, 0x44004200 +; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0 +; GFX1013-NEXT: v_mov_b32_e32 v3, 1.0 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[12:15] a16 ; GFX1013-NEXT: s_waitcnt vmcnt(0) @@ -497,10 +497,10 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX1030-NEXT: v_mov_b32_e32 v4, 2.0 -; GFX1030-NEXT: v_mov_b32_e32 v5, 0x44004200 -; GFX1030-NEXT: v_mov_b32_e32 v6, 0x46004500 ; GFX1030-NEXT: v_mov_b32_e32 v7, 0x48004700 +; GFX1030-NEXT: v_mov_b32_e32 v6, 0x46004500 +; GFX1030-NEXT: v_mov_b32_e32 v5, 0x44004200 +; GFX1030-NEXT: v_mov_b32_e32 v4, 2.0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_add_co_u32 v0, s0, s0, v2 ; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 @@ -625,20 +625,20 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1013-NEXT: v_mov_b32_e32 v3, 0 -; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0 -; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0 -; GFX1013-NEXT: v_mov_b32_e32 v6, 0x40400000 -; GFX1013-NEXT: v_mov_b32_e32 v7, 4.0 -; GFX1013-NEXT: v_mov_b32_e32 v8, 0x40a00000 -; GFX1013-NEXT: v_mov_b32_e32 v9, 0x40c00000 -; GFX1013-NEXT: v_mov_b32_e32 v10, 0x40e00000 ; GFX1013-NEXT: v_mov_b32_e32 v11, 0x41000000 +; GFX1013-NEXT: v_mov_b32_e32 v10, 0x40e00000 +; GFX1013-NEXT: v_mov_b32_e32 v9, 0x40c00000 +; GFX1013-NEXT: v_mov_b32_e32 v8, 0x40a00000 +; GFX1013-NEXT: v_mov_b32_e32 v7, 4.0 +; GFX1013-NEXT: v_mov_b32_e32 v6, 0x40400000 +; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0 +; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) ; GFX1013-NEXT: v_add_co_u32 v0, s4, s6, v0 ; GFX1013-NEXT: v_add_co_ci_u32_e64 v1, s4, s7, 0, s4 ; GFX1013-NEXT: flat_load_dword v2, v[0:1] -; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c7 ; GFX1013-NEXT: v_bfrev_b32_e32 v1, 4.0 +; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c7 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3] ; GFX1013-NEXT: s_waitcnt vmcnt(0) @@ -786,17 +786,17 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1013-NEXT: v_mov_b32_e32 v3, 0 -; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0 -; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0 -; GFX1013-NEXT: v_mov_b32_e32 v6, 0x44004200 -; GFX1013-NEXT: v_mov_b32_e32 v7, 0x46004500 ; GFX1013-NEXT: v_mov_b32_e32 v8, 0x48004700 +; GFX1013-NEXT: v_mov_b32_e32 v7, 0x46004500 +; GFX1013-NEXT: v_mov_b32_e32 v6, 0x44004200 +; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0 +; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) ; GFX1013-NEXT: v_add_co_u32 v0, s4, s6, v0 ; GFX1013-NEXT: v_add_co_ci_u32_e64 v1, s4, s7, 0, s4 ; GFX1013-NEXT: flat_load_dword v2, v[0:1] -; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c6 ; GFX1013-NEXT: v_bfrev_b32_e32 v1, 4.0 +; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c6 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16 ; GFX1013-NEXT: s_waitcnt vmcnt(0) @@ -810,11 +810,11 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 +; GFX1030-NEXT: v_mov_b32_e32 v8, 0x48004700 +; GFX1030-NEXT: v_mov_b32_e32 v7, 0x46004500 +; GFX1030-NEXT: v_mov_b32_e32 v6, 0x44004200 ; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0 -; GFX1030-NEXT: v_mov_b32_e32 v6, 0x44004200 -; GFX1030-NEXT: v_mov_b32_e32 v7, 0x46004500 -; GFX1030-NEXT: v_mov_b32_e32 v8, 0x48004700 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_add_co_u32 v0, s4, s6, v0 ; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s4 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll index fa6d878ad7556..78417548e0384 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll @@ -392,10 +392,11 @@ define amdgpu_ps float @general_case_load_with_waterfall(ptr %p, i16 %stride, i6 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 ; CHECK-NEXT: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[COPY3]], killed [[S_MOV_B32_]], killed [[V_LSHLREV_B32_e64_]], implicit $exec ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, killed [[V_AND_OR_B32_e64_]], %subreg.sub1, killed [[COPY5]], %subreg.sub2, [[COPY]], %subreg.sub3 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, killed [[V_AND_OR_B32_e64_]], %subreg.sub1, killed [[COPY5]], %subreg.sub2, [[COPY]], %subreg.sub3 ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[COPY6]], killed [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY [[REG_SEQUENCE1]] + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[COPY6]], killed [[COPY7]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; @@ -413,17 +414,17 @@ define amdgpu_ps float @general_case_load_with_waterfall(ptr %p, i16 %stride, i6 ; CHECK45-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; CHECK45-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 28 ; CHECK45-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 killed [[S_MOV_B32_]], [[COPY]], implicit $exec - ; CHECK45-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; CHECK45-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_1]], %subreg.sub0, killed [[V_LSHLREV_B32_e64_]], %subreg.sub1 + ; CHECK45-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK45-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, killed [[V_LSHLREV_B32_e64_]], %subreg.sub1 ; CHECK45-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub1 - ; CHECK45-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 7 + ; CHECK45-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 7 ; CHECK45-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; CHECK45-NEXT: [[V_LSHRREV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_LSHRREV_B64_e64 killed [[S_MOV_B32_2]], [[COPY7]], implicit $exec + ; CHECK45-NEXT: [[V_LSHRREV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_LSHRREV_B64_e64 killed [[S_MOV_B32_1]], [[COPY7]], implicit $exec ; CHECK45-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[V_LSHRREV_B64_e64_]].sub1 ; CHECK45-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] - ; CHECK45-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 12 - ; CHECK45-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 killed [[S_MOV_B32_3]], killed [[COPY9]], implicit $exec - ; CHECK45-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_1]], %subreg.sub0, killed [[V_LSHLREV_B32_e64_1]], %subreg.sub1 + ; CHECK45-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 12 + ; CHECK45-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 killed [[S_MOV_B32_2]], killed [[COPY9]], implicit $exec + ; CHECK45-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, killed [[V_LSHLREV_B32_e64_1]], %subreg.sub1 ; CHECK45-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE3]].sub1 ; CHECK45-NEXT: [[V_OR3_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR3_B32_e64 killed [[COPY8]], killed [[COPY10]], killed [[COPY6]], implicit $exec ; CHECK45-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -435,9 +436,9 @@ define amdgpu_ps float @general_case_load_with_waterfall(ptr %p, i16 %stride, i6 ; CHECK45-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub0 ; CHECK45-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 ; CHECK45-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; CHECK45-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 25 - ; CHECK45-NEXT: [[V_LSHLREV_B32_e64_2:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 killed [[S_MOV_B32_4]], killed [[COPY17]], implicit $exec - ; CHECK45-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_1]], %subreg.sub0, killed [[V_LSHLREV_B32_e64_2]], %subreg.sub1 + ; CHECK45-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 25 + ; CHECK45-NEXT: [[V_LSHLREV_B32_e64_2:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 killed [[S_MOV_B32_3]], killed [[COPY17]], implicit $exec + ; CHECK45-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, killed [[V_LSHLREV_B32_e64_2]], %subreg.sub1 ; CHECK45-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE5]].sub1 ; CHECK45-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 killed [[COPY16]], killed [[COPY18]], implicit $exec ; CHECK45-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 @@ -446,9 +447,9 @@ define amdgpu_ps float @general_case_load_with_waterfall(ptr %p, i16 %stride, i6 ; CHECK45-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_OR_B32_e64_1]], %subreg.sub0, killed [[V_OR_B32_e64_]], %subreg.sub1 ; CHECK45-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE6]].sub1 ; CHECK45-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE6]].sub0 - ; CHECK45-NEXT: [[REG_SEQUENCE7:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY22]], %subreg.sub0, killed [[COPY21]], %subreg.sub1, killed [[COPY15]], %subreg.sub2, killed [[COPY14]], %subreg.sub3 - ; CHECK45-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; CHECK45-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN [[COPY23]], killed [[REG_SEQUENCE7]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) + ; CHECK45-NEXT: [[REG_SEQUENCE7:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE killed [[COPY22]], %subreg.sub0, killed [[COPY21]], %subreg.sub1, killed [[COPY15]], %subreg.sub2, killed [[COPY14]], %subreg.sub3 + ; CHECK45-NEXT: [[COPY23:%[0-9]+]]:sgpr_128 = COPY [[REG_SEQUENCE7]] + ; CHECK45-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN [[V_MOV_B32_e32_]], killed [[COPY23]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK45-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_IDXEN]] ; CHECK45-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 %stride, i64 %numVals, i32 %flags) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll index ff3de9e05e897..bc72687e260e7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll @@ -26,6 +26,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-MINREG-NEXT: v_add_u32_e32 v5, s1, v0 ; GCN-MINREG-NEXT: v_mov_b32_e32 v0, s1 ; GCN-MINREG-NEXT: v_add_u32_e32 v3, 0x6000, v4 +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: ds_write_b128 v5, a[28:31] offset:112 @@ -47,6 +48,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) ; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: s_nop 2 @@ -69,6 +71,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) ; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: s_nop 2 @@ -91,6 +94,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) ; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: s_nop 2 @@ -113,6 +117,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) ; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: s_nop 2 @@ -125,11 +130,6 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-MINREG-NEXT: ds_write_b128 v0, a[0:3] offset:32768 ; GCN-MINREG-NEXT: ds_write_b128 v0, a[4:7] offset:32784 ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) -; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) -; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) -; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) -; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) -; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: s_endpgm ; ; GCN-MAXOCC-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: @@ -495,8 +495,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] ; GCN-MINREG-NEXT: v_mov_b32_e32 v2, s1 ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) -; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: s_nop 1 ; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:8288 @@ -520,6 +520,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) ; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] ; GCN-MINREG-NEXT: v_add_u32_e32 v4, 0x6000, v3 +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: s_nop 1 @@ -542,6 +543,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) ; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: s_nop 2 @@ -564,6 +566,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) ; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: s_nop 2 @@ -576,9 +579,6 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-MINREG-NEXT: ds_write_b128 v2, a[4:7] offset:32784 ; GCN-MINREG-NEXT: ds_write_b128 v2, a[0:3] offset:32768 ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) -; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) -; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) -; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: s_endpgm ; ; GCN-MAXOCC-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave_split_region: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll index 19bbfe1cdd9bb..6d6b4fa02c567 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll @@ -627,23 +627,23 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; GCN-NEXT: v_mov_b32_e32 v1, 2.0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_u32_e32 v3, s0, v0 -; GCN-NEXT: ds_read_b128 a[156:159], v3 offset:112 -; GCN-NEXT: ds_read_b128 a[152:155], v3 offset:96 -; GCN-NEXT: ds_read_b128 a[148:151], v3 offset:80 -; GCN-NEXT: ds_read_b128 a[144:147], v3 offset:64 -; GCN-NEXT: ds_read_b128 a[128:131], v3 -; GCN-NEXT: ds_read_b128 a[132:135], v3 offset:16 -; GCN-NEXT: ds_read_b128 a[136:139], v3 offset:32 -; GCN-NEXT: ds_read_b128 a[140:143], v3 offset:48 +; GCN-NEXT: ds_read_b128 a[60:63], v3 offset:112 +; GCN-NEXT: ds_read_b128 a[56:59], v3 offset:96 +; GCN-NEXT: ds_read_b128 a[52:55], v3 offset:80 +; GCN-NEXT: ds_read_b128 a[48:51], v3 offset:64 +; GCN-NEXT: ds_read_b128 a[32:35], v3 +; GCN-NEXT: ds_read_b128 a[36:39], v3 offset:16 +; GCN-NEXT: ds_read_b128 a[40:43], v3 offset:32 +; GCN-NEXT: ds_read_b128 a[44:47], v3 offset:48 ; GCN-NEXT: v_add_u32_e32 v4, 0x6000, v3 -; GCN-NEXT: ds_read_b128 a[124:127], v3 offset:8304 -; GCN-NEXT: ds_read_b128 a[120:123], v3 offset:8288 -; GCN-NEXT: ds_read_b128 a[116:119], v3 offset:8272 -; GCN-NEXT: ds_read_b128 a[112:115], v3 offset:8256 -; GCN-NEXT: ds_read_b128 a[108:111], v3 offset:8240 -; GCN-NEXT: ds_read_b128 a[104:107], v3 offset:8224 -; GCN-NEXT: ds_read_b128 a[100:103], v3 offset:8208 -; GCN-NEXT: ds_read_b128 a[96:99], v3 offset:8192 +; GCN-NEXT: ds_read_b128 a[28:31], v3 offset:8304 +; GCN-NEXT: ds_read_b128 a[24:27], v3 offset:8288 +; GCN-NEXT: ds_read_b128 a[20:23], v3 offset:8272 +; GCN-NEXT: ds_read_b128 a[16:19], v3 offset:8256 +; GCN-NEXT: ds_read_b128 a[12:15], v3 offset:8240 +; GCN-NEXT: ds_read_b128 a[8:11], v3 offset:8224 +; GCN-NEXT: ds_read_b128 a[4:7], v3 offset:8208 +; GCN-NEXT: ds_read_b128 a[0:3], v3 offset:8192 ; GCN-NEXT: ds_read_b128 a[92:95], v3 offset:24688 ; GCN-NEXT: ds_read_b128 a[88:91], v3 offset:24672 ; GCN-NEXT: ds_read_b128 a[84:87], v3 offset:24656 @@ -652,50 +652,50 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; GCN-NEXT: ds_read_b128 a[72:75], v3 offset:24608 ; GCN-NEXT: ds_read_b128 a[68:71], v3 offset:24592 ; GCN-NEXT: ds_read_b128 a[64:67], v3 offset:24576 -; GCN-NEXT: ds_read_b128 a[60:63], v3 offset:49264 -; GCN-NEXT: ds_read_b128 a[56:59], v3 offset:49248 -; GCN-NEXT: ds_read_b128 a[52:55], v3 offset:49232 -; GCN-NEXT: ds_read_b128 a[48:51], v3 offset:49216 -; GCN-NEXT: ds_read_b128 a[44:47], v3 offset:49200 -; GCN-NEXT: ds_read_b128 a[40:43], v3 offset:49184 -; GCN-NEXT: ds_read_b128 a[36:39], v3 offset:49168 -; GCN-NEXT: ds_read_b128 a[32:35], v3 offset:49152 -; GCN-NEXT: ds_read_b128 a[28:31], v4 offset:57456 -; GCN-NEXT: ds_read_b128 a[24:27], v4 offset:57440 -; GCN-NEXT: ds_read_b128 a[20:23], v4 offset:57424 -; GCN-NEXT: ds_read_b128 a[16:19], v4 offset:57408 -; GCN-NEXT: ds_read_b128 a[0:3], v4 offset:57344 -; GCN-NEXT: ds_read_b128 a[4:7], v4 offset:57360 -; GCN-NEXT: ds_read_b128 a[8:11], v4 offset:57376 -; GCN-NEXT: ds_read_b128 a[12:15], v4 offset:57392 +; GCN-NEXT: ds_read_b128 a[124:127], v3 offset:49264 +; GCN-NEXT: ds_read_b128 a[120:123], v3 offset:49248 +; GCN-NEXT: ds_read_b128 a[116:119], v3 offset:49232 +; GCN-NEXT: ds_read_b128 a[112:115], v3 offset:49216 +; GCN-NEXT: ds_read_b128 a[108:111], v3 offset:49200 +; GCN-NEXT: ds_read_b128 a[104:107], v3 offset:49184 +; GCN-NEXT: ds_read_b128 a[100:103], v3 offset:49168 +; GCN-NEXT: ds_read_b128 a[96:99], v3 offset:49152 +; GCN-NEXT: ds_read_b128 a[156:159], v4 offset:57456 +; GCN-NEXT: ds_read_b128 a[152:155], v4 offset:57440 +; GCN-NEXT: ds_read_b128 a[148:151], v4 offset:57424 +; GCN-NEXT: ds_read_b128 a[144:147], v4 offset:57408 +; GCN-NEXT: ds_read_b128 a[128:131], v4 offset:57344 +; GCN-NEXT: ds_read_b128 a[132:135], v4 offset:57360 +; GCN-NEXT: ds_read_b128 a[136:139], v4 offset:57376 +; GCN-NEXT: ds_read_b128 a[140:143], v4 offset:57392 ; GCN-NEXT: s_waitcnt lgkmcnt(14) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159] +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63] ; GCN-NEXT: v_add_u32_e32 v0, s1, v0 ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(40) SyncID(0) +; GCN-NEXT: s_waitcnt lgkmcnt(8) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v1, a[96:127] ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v1, a[64:95] -; GCN-NEXT: s_waitcnt lgkmcnt(8) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63] -; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159] ; GCN-NEXT: s_nop 11 -; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:112 -; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:96 -; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:80 -; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:64 -; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:48 -; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:32 -; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:16 -; GCN-NEXT: ds_write_b128 v0, a[128:131] +; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:112 +; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:96 +; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:80 +; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:64 +; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:48 +; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:32 +; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:16 +; GCN-NEXT: ds_write_b128 v0, a[32:35] ; GCN-NEXT: v_mov_b32_e32 v0, s1 -; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:8288 -; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:8304 -; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:8256 -; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:8272 -; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:8224 -; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:8240 -; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:8192 -; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:8208 +; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:8288 +; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:8304 +; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:8256 +; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:8272 +; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:8224 +; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:8240 +; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:8192 +; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:8208 ; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:16480 ; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:16496 ; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:16448 @@ -704,22 +704,22 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:16432 ; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:16384 ; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:16400 -; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:24672 -; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:24688 -; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:24640 -; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:24656 -; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:24608 -; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:24624 -; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:24576 -; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:24592 -; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:32864 -; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:32880 -; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:32832 -; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:32848 -; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32800 -; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:32816 -; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:32768 -; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:32784 +; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:24672 +; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:24688 +; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:24640 +; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:24656 +; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:24608 +; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:24624 +; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:24576 +; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:24592 +; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:32864 +; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:32880 +; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:32832 +; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:32848 +; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:32800 +; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:32816 +; GCN-NEXT: ds_write_b128 v0, a[128:131] offset:32768 +; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:32784 ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(5) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(40) SyncID(0) ; GCN-NEXT: s_endpgm @@ -733,23 +733,23 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v1, 2.0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_add_u32_e32 v3, s0, v0 -; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v3 offset:112 -; EXACTCUTOFF-NEXT: ds_read_b128 a[152:155], v3 offset:96 -; EXACTCUTOFF-NEXT: ds_read_b128 a[148:151], v3 offset:80 -; EXACTCUTOFF-NEXT: ds_read_b128 a[144:147], v3 offset:64 -; EXACTCUTOFF-NEXT: ds_read_b128 a[128:131], v3 -; EXACTCUTOFF-NEXT: ds_read_b128 a[132:135], v3 offset:16 -; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v3 offset:32 -; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v3 offset:48 +; EXACTCUTOFF-NEXT: ds_read_b128 a[60:63], v3 offset:112 +; EXACTCUTOFF-NEXT: ds_read_b128 a[56:59], v3 offset:96 +; EXACTCUTOFF-NEXT: ds_read_b128 a[52:55], v3 offset:80 +; EXACTCUTOFF-NEXT: ds_read_b128 a[48:51], v3 offset:64 +; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v3 +; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v3 offset:16 +; EXACTCUTOFF-NEXT: ds_read_b128 a[40:43], v3 offset:32 +; EXACTCUTOFF-NEXT: ds_read_b128 a[44:47], v3 offset:48 ; EXACTCUTOFF-NEXT: v_add_u32_e32 v4, 0x6000, v3 -; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v3 offset:8304 -; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v3 offset:8288 -; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], v3 offset:8272 -; EXACTCUTOFF-NEXT: ds_read_b128 a[112:115], v3 offset:8256 -; EXACTCUTOFF-NEXT: ds_read_b128 a[108:111], v3 offset:8240 -; EXACTCUTOFF-NEXT: ds_read_b128 a[104:107], v3 offset:8224 -; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v3 offset:8208 -; EXACTCUTOFF-NEXT: ds_read_b128 a[96:99], v3 offset:8192 +; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v3 offset:8304 +; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v3 offset:8288 +; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v3 offset:8272 +; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v3 offset:8256 +; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v3 offset:8240 +; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v3 offset:8224 +; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v3 offset:8208 +; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v3 offset:8192 ; EXACTCUTOFF-NEXT: ds_read_b128 a[92:95], v3 offset:24688 ; EXACTCUTOFF-NEXT: ds_read_b128 a[88:91], v3 offset:24672 ; EXACTCUTOFF-NEXT: ds_read_b128 a[84:87], v3 offset:24656 @@ -758,50 +758,50 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; EXACTCUTOFF-NEXT: ds_read_b128 a[72:75], v3 offset:24608 ; EXACTCUTOFF-NEXT: ds_read_b128 a[68:71], v3 offset:24592 ; EXACTCUTOFF-NEXT: ds_read_b128 a[64:67], v3 offset:24576 -; EXACTCUTOFF-NEXT: ds_read_b128 a[60:63], v3 offset:49264 -; EXACTCUTOFF-NEXT: ds_read_b128 a[56:59], v3 offset:49248 -; EXACTCUTOFF-NEXT: ds_read_b128 a[52:55], v3 offset:49232 -; EXACTCUTOFF-NEXT: ds_read_b128 a[48:51], v3 offset:49216 -; EXACTCUTOFF-NEXT: ds_read_b128 a[44:47], v3 offset:49200 -; EXACTCUTOFF-NEXT: ds_read_b128 a[40:43], v3 offset:49184 -; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v3 offset:49168 -; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v3 offset:49152 -; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v4 offset:57456 -; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v4 offset:57440 -; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v4 offset:57424 -; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v4 offset:57408 -; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v4 offset:57344 -; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v4 offset:57360 -; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v4 offset:57376 -; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v4 offset:57392 +; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v3 offset:49264 +; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v3 offset:49248 +; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], v3 offset:49232 +; EXACTCUTOFF-NEXT: ds_read_b128 a[112:115], v3 offset:49216 +; EXACTCUTOFF-NEXT: ds_read_b128 a[108:111], v3 offset:49200 +; EXACTCUTOFF-NEXT: ds_read_b128 a[104:107], v3 offset:49184 +; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v3 offset:49168 +; EXACTCUTOFF-NEXT: ds_read_b128 a[96:99], v3 offset:49152 +; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v4 offset:57456 +; EXACTCUTOFF-NEXT: ds_read_b128 a[152:155], v4 offset:57440 +; EXACTCUTOFF-NEXT: ds_read_b128 a[148:151], v4 offset:57424 +; EXACTCUTOFF-NEXT: ds_read_b128 a[144:147], v4 offset:57408 +; EXACTCUTOFF-NEXT: ds_read_b128 a[128:131], v4 offset:57344 +; EXACTCUTOFF-NEXT: ds_read_b128 a[132:135], v4 offset:57360 +; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v4 offset:57376 +; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v4 offset:57392 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(14) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159] +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63] ; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s1, v0 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(40) SyncID(0) +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(8) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v1, a[96:127] ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v1, a[64:95] -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(8) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63] -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159] ; EXACTCUTOFF-NEXT: s_nop 11 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[156:159] offset:112 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[152:155] offset:96 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[148:151] offset:80 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[144:147] offset:64 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[140:143] offset:48 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[136:139] offset:32 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[132:135] offset:16 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[128:131] +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[60:63] offset:112 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[56:59] offset:96 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[52:55] offset:80 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[48:51] offset:64 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[44:47] offset:48 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[40:43] offset:32 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[36:39] offset:16 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[32:35] ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s1 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[120:123] offset:8288 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[124:127] offset:8304 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[112:115] offset:8256 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[116:119] offset:8272 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[104:107] offset:8224 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[108:111] offset:8240 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[96:99] offset:8192 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[100:103] offset:8208 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:8288 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:8304 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:8256 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:8272 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:8224 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:8240 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:8192 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:8208 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[88:91] offset:16480 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[92:95] offset:16496 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[80:83] offset:16448 @@ -810,22 +810,22 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[76:79] offset:16432 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[64:67] offset:16384 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[68:71] offset:16400 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[56:59] offset:24672 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[60:63] offset:24688 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[48:51] offset:24640 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[52:55] offset:24656 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[40:43] offset:24608 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[44:47] offset:24624 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[32:35] offset:24576 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[36:39] offset:24592 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:32864 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:32880 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:32832 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:32848 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:32800 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:32816 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:32768 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:32784 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[120:123] offset:24672 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[124:127] offset:24688 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[112:115] offset:24640 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[116:119] offset:24656 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[104:107] offset:24608 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[108:111] offset:24624 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[96:99] offset:24576 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[100:103] offset:24592 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[152:155] offset:32864 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[156:159] offset:32880 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[144:147] offset:32832 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[148:151] offset:32848 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[136:139] offset:32800 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[140:143] offset:32816 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[128:131] offset:32768 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[132:135] offset:32784 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(5) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(40) SyncID(0) ; EXACTCUTOFF-NEXT: s_endpgm @@ -1202,57 +1202,57 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 ; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-NEXT: v_add_u32_e32 v1, s6, v0 -; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:112 -; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:96 -; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:80 -; GCN-NEXT: ds_read_b128 a[144:147], v1 offset:64 -; GCN-NEXT: ds_read_b128 a[128:131], v1 -; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:16 -; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:32 -; GCN-NEXT: ds_read_b128 a[140:143], v1 offset:48 -; GCN-NEXT: v_mul_f32_e32 v9, s1, v3 -; GCN-NEXT: v_mov_b32_e32 v12, 1.0 +; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:112 +; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:96 +; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:80 +; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:64 +; GCN-NEXT: ds_read_b128 a[32:35], v1 +; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:16 +; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:32 +; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:48 +; GCN-NEXT: v_mov_b32_e32 v9, 1.0 ; GCN-NEXT: v_ldexp_f32 v4, v4, v5 ; GCN-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0 -; GCN-NEXT: v_rndne_f32_e32 v10, v9 +; GCN-NEXT: v_mul_f32_e32 v10, s1, v3 ; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v5 ; GCN-NEXT: v_mov_b32_e32 v6, 0x42b17218 -; GCN-NEXT: v_sub_f32_e32 v11, v9, v10 -; GCN-NEXT: v_fma_f32 v9, s1, v3, -v9 +; GCN-NEXT: v_rndne_f32_e32 v11, v10 ; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc ; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v6 ; GCN-NEXT: v_mov_b32_e32 v8, 0x7f800000 -; GCN-NEXT: v_fmac_f32_e32 v9, s1, v7 +; GCN-NEXT: v_sub_f32_e32 v12, v10, v11 +; GCN-NEXT: v_fma_f32 v10, s1, v3, -v10 ; GCN-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GCN-NEXT: v_add_f32_e32 v9, v11, v9 -; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:8304 +; GCN-NEXT: v_fmac_f32_e32 v10, s1, v7 +; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:8304 ; GCN-NEXT: s_waitcnt lgkmcnt(1) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v12, v4, a[128:159] -; GCN-NEXT: v_exp_f32_e32 v4, v9 -; GCN-NEXT: v_cvt_i32_f32_e32 v9, v10 -; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:8288 -; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:8272 -; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:8256 -; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:8240 -; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:8224 -; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:8208 -; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:8192 -; GCN-NEXT: v_ldexp_f32 v4, v4, v9 -; GCN-NEXT: v_mul_f32_e32 v9, s2, v3 -; GCN-NEXT: v_rndne_f32_e32 v10, v9 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v9, v4, a[32:63] +; GCN-NEXT: v_add_f32_e32 v4, v12, v10 +; GCN-NEXT: v_exp_f32_e32 v4, v4 +; GCN-NEXT: v_cvt_i32_f32_e32 v10, v11 +; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:8288 +; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:8272 +; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:8256 +; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:8240 +; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:8224 +; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:8208 +; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:8192 +; GCN-NEXT: v_ldexp_f32 v4, v4, v10 ; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s1, v5 -; GCN-NEXT: v_sub_f32_e32 v11, v9, v10 -; GCN-NEXT: v_fma_f32 v9, s2, v3, -v9 ; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc ; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s1, v6 -; GCN-NEXT: v_fmac_f32_e32 v9, s2, v7 ; GCN-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GCN-NEXT: v_add_f32_e32 v9, v11, v9 +; GCN-NEXT: v_mul_f32_e32 v10, s2, v3 +; GCN-NEXT: v_rndne_f32_e32 v11, v10 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v9, v4, a[0:31] +; GCN-NEXT: v_fma_f32 v4, s2, v3, -v10 +; GCN-NEXT: v_sub_f32_e32 v12, v10, v11 +; GCN-NEXT: v_fmac_f32_e32 v4, s2, v7 +; GCN-NEXT: v_add_f32_e32 v4, v12, v4 +; GCN-NEXT: v_exp_f32_e32 v4, v4 +; GCN-NEXT: v_cvt_i32_f32_e32 v10, v11 ; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:24688 -; GCN-NEXT: s_waitcnt lgkmcnt(1) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v12, v4, a[96:127] -; GCN-NEXT: v_exp_f32_e32 v4, v9 -; GCN-NEXT: v_cvt_i32_f32_e32 v9, v10 ; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:24672 ; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:24656 ; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:24640 @@ -1261,68 +1261,68 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:24592 ; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:24576 ; GCN-NEXT: v_add_u32_e32 v2, 0x6000, v1 -; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:49264 -; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:49248 -; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:49232 -; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:49216 -; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:49200 -; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:49184 -; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:49168 -; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:49152 -; GCN-NEXT: v_ldexp_f32 v1, v4, v9 -; GCN-NEXT: v_mul_f32_e32 v4, s3, v3 +; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:49264 +; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:49248 +; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:49232 +; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:49216 +; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:49200 +; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:49184 +; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:49168 +; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:49152 +; GCN-NEXT: v_ldexp_f32 v1, v4, v10 ; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v5 -; GCN-NEXT: v_rndne_f32_e32 v9, v4 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v6 -; GCN-NEXT: v_sub_f32_e32 v10, v4, v9 -; GCN-NEXT: v_fma_f32 v4, s3, v3, -v4 +; GCN-NEXT: v_mul_f32_e32 v4, s3, v3 ; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GCN-NEXT: v_fmac_f32_e32 v4, s3, v7 +; GCN-NEXT: v_rndne_f32_e32 v10, v4 ; GCN-NEXT: s_load_dword s8, s[4:5], 0x54 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v12, v1, a[64:95] -; GCN-NEXT: v_add_f32_e32 v1, v10, v4 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v9, v1, a[64:95] +; GCN-NEXT: v_sub_f32_e32 v1, v4, v10 +; GCN-NEXT: v_fma_f32 v4, s3, v3, -v4 +; GCN-NEXT: v_fmac_f32_e32 v4, s3, v7 +; GCN-NEXT: v_add_f32_e32 v1, v1, v4 ; GCN-NEXT: v_exp_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_i32_f32_e32 v4, v9 +; GCN-NEXT: v_cvt_i32_f32_e32 v4, v10 ; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v5 -; GCN-NEXT: ds_read_b128 a[28:31], v2 offset:57456 -; GCN-NEXT: ds_read_b128 a[24:27], v2 offset:57440 +; GCN-NEXT: ds_read_b128 a[156:159], v2 offset:57456 +; GCN-NEXT: ds_read_b128 a[152:155], v2 offset:57440 ; GCN-NEXT: v_ldexp_f32 v1, v1, v4 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v6 -; GCN-NEXT: v_mul_f32_e32 v4, s8, v3 ; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GCN-NEXT: v_rndne_f32_e32 v9, v4 +; GCN-NEXT: v_mul_f32_e32 v4, s8, v3 ; GCN-NEXT: v_fma_f32 v3, s8, v3, -v4 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v9, v1, a[96:127] +; GCN-NEXT: v_rndne_f32_e32 v1, v4 +; GCN-NEXT: v_sub_f32_e32 v10, v4, v1 ; GCN-NEXT: v_fmac_f32_e32 v3, s8, v7 -; GCN-NEXT: ds_read_b128 a[20:23], v2 offset:57424 -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v12, v1, a[32:63] -; GCN-NEXT: v_sub_f32_e32 v1, v4, v9 -; GCN-NEXT: v_add_f32_e32 v1, v1, v3 -; GCN-NEXT: v_exp_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_i32_f32_e32 v3, v9 -; GCN-NEXT: ds_read_b128 a[16:19], v2 offset:57408 -; GCN-NEXT: ds_read_b128 a[0:3], v2 offset:57344 -; GCN-NEXT: ds_read_b128 a[4:7], v2 offset:57360 -; GCN-NEXT: ds_read_b128 a[8:11], v2 offset:57376 -; GCN-NEXT: ds_read_b128 a[12:15], v2 offset:57392 -; GCN-NEXT: v_ldexp_f32 v1, v1, v3 +; GCN-NEXT: v_add_f32_e32 v3, v10, v3 +; GCN-NEXT: v_exp_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GCN-NEXT: ds_read_b128 a[148:151], v2 offset:57424 +; GCN-NEXT: ds_read_b128 a[144:147], v2 offset:57408 +; GCN-NEXT: ds_read_b128 a[128:131], v2 offset:57344 +; GCN-NEXT: ds_read_b128 a[132:135], v2 offset:57360 +; GCN-NEXT: ds_read_b128 a[136:139], v2 offset:57376 +; GCN-NEXT: ds_read_b128 a[140:143], v2 offset:57392 +; GCN-NEXT: v_ldexp_f32 v1, v3, v1 ; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s8, v5 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s8, v6 ; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; GCN-NEXT: v_add_u32_e32 v0, s7, v0 -; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:112 +; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:112 ; GCN-NEXT: s_waitcnt lgkmcnt(1) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v12, v1, a[0:31] -; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:96 -; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:80 -; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:64 -; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:48 -; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:32 -; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:16 -; GCN-NEXT: ds_write_b128 v0, a[128:131] +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v9, v1, a[128:159] +; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:96 +; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:80 +; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:64 +; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:48 +; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:32 +; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:16 +; GCN-NEXT: ds_write_b128 v0, a[32:35] ; GCN-NEXT: v_mov_b32_e32 v0, s7 ; GCN-NEXT: ; kill: killed $sgpr4_sgpr5 ; GCN-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) @@ -1335,14 +1335,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:8288 -; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:8304 -; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:8256 -; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:8272 -; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:8224 -; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:8240 -; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:8192 -; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:8208 +; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:8288 +; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:8304 +; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:8256 +; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:8272 +; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:8224 +; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:8240 +; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:8192 +; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:8208 ; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:16480 ; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:16496 ; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:16448 @@ -1351,22 +1351,22 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:16432 ; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:16384 ; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:16400 -; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:24672 -; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:24688 -; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:24640 -; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:24656 -; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:24608 -; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:24624 -; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:24576 -; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:24592 -; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:32864 -; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:32880 -; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:32832 -; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:32848 -; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32800 -; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:32816 -; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:32768 -; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:32784 +; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:24672 +; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:24688 +; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:24640 +; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:24656 +; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:24608 +; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:24624 +; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:24576 +; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:24592 +; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:32864 +; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:32880 +; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:32832 +; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:32848 +; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:32800 +; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:32816 +; GCN-NEXT: ds_write_b128 v0, a[128:131] offset:32768 +; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:32784 ; GCN-NEXT: s_endpgm ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_interleave_EXP_MFMA: @@ -1387,57 +1387,57 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v5, v5 ; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s6, v0 -; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v1 offset:112 -; EXACTCUTOFF-NEXT: ds_read_b128 a[152:155], v1 offset:96 -; EXACTCUTOFF-NEXT: ds_read_b128 a[148:151], v1 offset:80 -; EXACTCUTOFF-NEXT: ds_read_b128 a[144:147], v1 offset:64 -; EXACTCUTOFF-NEXT: ds_read_b128 a[128:131], v1 -; EXACTCUTOFF-NEXT: ds_read_b128 a[132:135], v1 offset:16 -; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v1 offset:32 -; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v1 offset:48 -; EXACTCUTOFF-NEXT: v_mul_f32_e32 v9, s1, v3 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v12, 1.0 +; EXACTCUTOFF-NEXT: ds_read_b128 a[60:63], v1 offset:112 +; EXACTCUTOFF-NEXT: ds_read_b128 a[56:59], v1 offset:96 +; EXACTCUTOFF-NEXT: ds_read_b128 a[52:55], v1 offset:80 +; EXACTCUTOFF-NEXT: ds_read_b128 a[48:51], v1 offset:64 +; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v1 +; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v1 offset:16 +; EXACTCUTOFF-NEXT: ds_read_b128 a[40:43], v1 offset:32 +; EXACTCUTOFF-NEXT: ds_read_b128 a[44:47], v1 offset:48 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v9, 1.0 ; EXACTCUTOFF-NEXT: v_ldexp_f32 v4, v4, v5 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0 -; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v10, v9 +; EXACTCUTOFF-NEXT: v_mul_f32_e32 v10, s1, v3 ; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v5 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v6, 0x42b17218 -; EXACTCUTOFF-NEXT: v_sub_f32_e32 v11, v9, v10 -; EXACTCUTOFF-NEXT: v_fma_f32 v9, s1, v3, -v9 +; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v11, v10 ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc ; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v6 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v8, 0x7f800000 -; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v9, s1, v7 +; EXACTCUTOFF-NEXT: v_sub_f32_e32 v12, v10, v11 +; EXACTCUTOFF-NEXT: v_fma_f32 v10, s1, v3, -v10 ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; EXACTCUTOFF-NEXT: v_add_f32_e32 v9, v11, v9 -; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v1 offset:8304 +; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v10, s1, v7 +; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:8304 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(1) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v12, v4, a[128:159] -; EXACTCUTOFF-NEXT: v_exp_f32_e32 v4, v9 -; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v9, v10 -; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v1 offset:8288 -; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], v1 offset:8272 -; EXACTCUTOFF-NEXT: ds_read_b128 a[112:115], v1 offset:8256 -; EXACTCUTOFF-NEXT: ds_read_b128 a[108:111], v1 offset:8240 -; EXACTCUTOFF-NEXT: ds_read_b128 a[104:107], v1 offset:8224 -; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v1 offset:8208 -; EXACTCUTOFF-NEXT: ds_read_b128 a[96:99], v1 offset:8192 -; EXACTCUTOFF-NEXT: v_ldexp_f32 v4, v4, v9 -; EXACTCUTOFF-NEXT: v_mul_f32_e32 v9, s2, v3 -; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v10, v9 +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v9, v4, a[32:63] +; EXACTCUTOFF-NEXT: v_add_f32_e32 v4, v12, v10 +; EXACTCUTOFF-NEXT: v_exp_f32_e32 v4, v4 +; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v10, v11 +; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:8288 +; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:8272 +; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:8256 +; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:8240 +; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:8224 +; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:8208 +; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:8192 +; EXACTCUTOFF-NEXT: v_ldexp_f32 v4, v4, v10 ; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s1, v5 -; EXACTCUTOFF-NEXT: v_sub_f32_e32 v11, v9, v10 -; EXACTCUTOFF-NEXT: v_fma_f32 v9, s2, v3, -v9 ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc ; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s1, v6 -; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v9, s2, v7 ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; EXACTCUTOFF-NEXT: v_add_f32_e32 v9, v11, v9 +; EXACTCUTOFF-NEXT: v_mul_f32_e32 v10, s2, v3 +; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v11, v10 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v9, v4, a[0:31] +; EXACTCUTOFF-NEXT: v_fma_f32 v4, s2, v3, -v10 +; EXACTCUTOFF-NEXT: v_sub_f32_e32 v12, v10, v11 +; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v4, s2, v7 +; EXACTCUTOFF-NEXT: v_add_f32_e32 v4, v12, v4 +; EXACTCUTOFF-NEXT: v_exp_f32_e32 v4, v4 +; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v10, v11 ; EXACTCUTOFF-NEXT: ds_read_b128 a[92:95], v1 offset:24688 -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(1) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v12, v4, a[96:127] -; EXACTCUTOFF-NEXT: v_exp_f32_e32 v4, v9 -; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v9, v10 ; EXACTCUTOFF-NEXT: ds_read_b128 a[88:91], v1 offset:24672 ; EXACTCUTOFF-NEXT: ds_read_b128 a[84:87], v1 offset:24656 ; EXACTCUTOFF-NEXT: ds_read_b128 a[80:83], v1 offset:24640 @@ -1446,68 +1446,68 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; EXACTCUTOFF-NEXT: ds_read_b128 a[68:71], v1 offset:24592 ; EXACTCUTOFF-NEXT: ds_read_b128 a[64:67], v1 offset:24576 ; EXACTCUTOFF-NEXT: v_add_u32_e32 v2, 0x6000, v1 -; EXACTCUTOFF-NEXT: ds_read_b128 a[60:63], v1 offset:49264 -; EXACTCUTOFF-NEXT: ds_read_b128 a[56:59], v1 offset:49248 -; EXACTCUTOFF-NEXT: ds_read_b128 a[52:55], v1 offset:49232 -; EXACTCUTOFF-NEXT: ds_read_b128 a[48:51], v1 offset:49216 -; EXACTCUTOFF-NEXT: ds_read_b128 a[44:47], v1 offset:49200 -; EXACTCUTOFF-NEXT: ds_read_b128 a[40:43], v1 offset:49184 -; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v1 offset:49168 -; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v1 offset:49152 -; EXACTCUTOFF-NEXT: v_ldexp_f32 v1, v4, v9 -; EXACTCUTOFF-NEXT: v_mul_f32_e32 v4, s3, v3 +; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v1 offset:49264 +; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v1 offset:49248 +; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], v1 offset:49232 +; EXACTCUTOFF-NEXT: ds_read_b128 a[112:115], v1 offset:49216 +; EXACTCUTOFF-NEXT: ds_read_b128 a[108:111], v1 offset:49200 +; EXACTCUTOFF-NEXT: ds_read_b128 a[104:107], v1 offset:49184 +; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v1 offset:49168 +; EXACTCUTOFF-NEXT: ds_read_b128 a[96:99], v1 offset:49152 +; EXACTCUTOFF-NEXT: v_ldexp_f32 v1, v4, v10 ; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v5 -; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v9, v4 ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v6 -; EXACTCUTOFF-NEXT: v_sub_f32_e32 v10, v4, v9 -; EXACTCUTOFF-NEXT: v_fma_f32 v4, s3, v3, -v4 +; EXACTCUTOFF-NEXT: v_mul_f32_e32 v4, s3, v3 ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v4, s3, v7 +; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v10, v4 ; EXACTCUTOFF-NEXT: s_load_dword s8, s[4:5], 0x54 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v12, v1, a[64:95] -; EXACTCUTOFF-NEXT: v_add_f32_e32 v1, v10, v4 +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v9, v1, a[64:95] +; EXACTCUTOFF-NEXT: v_sub_f32_e32 v1, v4, v10 +; EXACTCUTOFF-NEXT: v_fma_f32 v4, s3, v3, -v4 +; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v4, s3, v7 +; EXACTCUTOFF-NEXT: v_add_f32_e32 v1, v1, v4 ; EXACTCUTOFF-NEXT: v_exp_f32_e32 v1, v1 -; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v4, v9 +; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v4, v10 ; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v5 -; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v2 offset:57456 -; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v2 offset:57440 +; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v2 offset:57456 +; EXACTCUTOFF-NEXT: ds_read_b128 a[152:155], v2 offset:57440 ; EXACTCUTOFF-NEXT: v_ldexp_f32 v1, v1, v4 ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v6 -; EXACTCUTOFF-NEXT: v_mul_f32_e32 v4, s8, v3 ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v9, v4 +; EXACTCUTOFF-NEXT: v_mul_f32_e32 v4, s8, v3 ; EXACTCUTOFF-NEXT: v_fma_f32 v3, s8, v3, -v4 +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v9, v1, a[96:127] +; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v1, v4 +; EXACTCUTOFF-NEXT: v_sub_f32_e32 v10, v4, v1 ; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v3, s8, v7 -; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v2 offset:57424 -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v12, v1, a[32:63] -; EXACTCUTOFF-NEXT: v_sub_f32_e32 v1, v4, v9 -; EXACTCUTOFF-NEXT: v_add_f32_e32 v1, v1, v3 -; EXACTCUTOFF-NEXT: v_exp_f32_e32 v1, v1 -; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v3, v9 -; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v2 offset:57408 -; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v2 offset:57344 -; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v2 offset:57360 -; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v2 offset:57376 -; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v2 offset:57392 -; EXACTCUTOFF-NEXT: v_ldexp_f32 v1, v1, v3 +; EXACTCUTOFF-NEXT: v_add_f32_e32 v3, v10, v3 +; EXACTCUTOFF-NEXT: v_exp_f32_e32 v3, v3 +; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v1, v1 +; EXACTCUTOFF-NEXT: ds_read_b128 a[148:151], v2 offset:57424 +; EXACTCUTOFF-NEXT: ds_read_b128 a[144:147], v2 offset:57408 +; EXACTCUTOFF-NEXT: ds_read_b128 a[128:131], v2 offset:57344 +; EXACTCUTOFF-NEXT: ds_read_b128 a[132:135], v2 offset:57360 +; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v2 offset:57376 +; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v2 offset:57392 +; EXACTCUTOFF-NEXT: v_ldexp_f32 v1, v3, v1 ; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s8, v5 ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s8, v6 ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s7, v0 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[156:159] offset:112 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[60:63] offset:112 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(1) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v12, v1, a[0:31] -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[152:155] offset:96 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[148:151] offset:80 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[144:147] offset:64 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[140:143] offset:48 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[136:139] offset:32 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[132:135] offset:16 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[128:131] +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v9, v1, a[128:159] +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[56:59] offset:96 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[52:55] offset:80 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[48:51] offset:64 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[44:47] offset:48 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[40:43] offset:32 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[36:39] offset:16 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[32:35] ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s7 ; EXACTCUTOFF-NEXT: ; kill: killed $sgpr4_sgpr5 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) @@ -1520,14 +1520,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[120:123] offset:8288 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[124:127] offset:8304 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[112:115] offset:8256 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[116:119] offset:8272 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[104:107] offset:8224 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[108:111] offset:8240 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[96:99] offset:8192 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[100:103] offset:8208 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:8288 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:8304 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:8256 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:8272 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:8224 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:8240 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:8192 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:8208 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[88:91] offset:16480 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[92:95] offset:16496 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[80:83] offset:16448 @@ -1536,22 +1536,22 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[76:79] offset:16432 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[64:67] offset:16384 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[68:71] offset:16400 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[56:59] offset:24672 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[60:63] offset:24688 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[48:51] offset:24640 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[52:55] offset:24656 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[40:43] offset:24608 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[44:47] offset:24624 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[32:35] offset:24576 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[36:39] offset:24592 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:32864 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:32880 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:32832 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:32848 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:32800 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:32816 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:32768 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:32784 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[120:123] offset:24672 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[124:127] offset:24688 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[112:115] offset:24640 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[116:119] offset:24656 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[104:107] offset:24608 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[108:111] offset:24624 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[96:99] offset:24576 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[100:103] offset:24592 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[152:155] offset:32864 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[156:159] offset:32880 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[144:147] offset:32832 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[148:151] offset:32848 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[136:139] offset:32800 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[140:143] offset:32816 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[128:131] offset:32768 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[132:135] offset:32784 ; EXACTCUTOFF-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll index 7d3b316915923..c98feeb96232d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll @@ -746,9 +746,9 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; GFX9-NEXT: s_mov_b64 exec, 0 ; GFX9-NEXT: .LBB6_6: ; %.continue1 ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 60 -; GFX9-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX9-NEXT: v_bfrev_b32_e32 v0, 60 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x3c00 +; GFX9-NEXT: exp mrt0 v1, v1, v0, v0 done compr vm ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .LBB6_7: ; GFX9-NEXT: s_mov_b64 exec, 0 @@ -792,9 +792,9 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 ; GFX10-32-NEXT: .LBB6_6: ; %.continue1 ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10-32-NEXT: v_mov_b32_e32 v0, 0x3c00 -; GFX10-32-NEXT: v_bfrev_b32_e32 v1, 60 -; GFX10-32-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX10-32-NEXT: v_bfrev_b32_e32 v0, 60 +; GFX10-32-NEXT: v_mov_b32_e32 v1, 0x3c00 +; GFX10-32-NEXT: exp mrt0 v1, v1, v0, v0 done compr vm ; GFX10-32-NEXT: s_endpgm ; GFX10-32-NEXT: .LBB6_7: ; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 @@ -838,9 +838,9 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; GFX10-64-NEXT: s_mov_b64 exec, 0 ; GFX10-64-NEXT: .LBB6_6: ; %.continue1 ; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX10-64-NEXT: v_mov_b32_e32 v0, 0x3c00 -; GFX10-64-NEXT: v_bfrev_b32_e32 v1, 60 -; GFX10-64-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX10-64-NEXT: v_bfrev_b32_e32 v0, 60 +; GFX10-64-NEXT: v_mov_b32_e32 v1, 0x3c00 +; GFX10-64-NEXT: exp mrt0 v1, v1, v0, v0 done compr vm ; GFX10-64-NEXT: s_endpgm ; GFX10-64-NEXT: .LBB6_7: ; GFX10-64-NEXT: s_mov_b64 exec, 0 @@ -1005,9 +1005,9 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX9-NEXT: .LBB7_8: ; %.return ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_and_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 60 -; GFX9-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX9-NEXT: v_bfrev_b32_e32 v0, 60 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x3c00 +; GFX9-NEXT: exp mrt0 v1, v1, v0, v0 done compr vm ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .LBB7_9: ; GFX9-NEXT: s_mov_b64 exec, 0 @@ -1068,9 +1068,9 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-32-NEXT: .LBB7_8: ; %.return ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0 -; GFX10-32-NEXT: v_mov_b32_e32 v0, 0x3c00 -; GFX10-32-NEXT: v_bfrev_b32_e32 v1, 60 -; GFX10-32-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX10-32-NEXT: v_bfrev_b32_e32 v0, 60 +; GFX10-32-NEXT: v_mov_b32_e32 v1, 0x3c00 +; GFX10-32-NEXT: exp mrt0 v1, v1, v0, v0 done compr vm ; GFX10-32-NEXT: s_endpgm ; GFX10-32-NEXT: .LBB7_9: ; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 @@ -1131,9 +1131,9 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-64-NEXT: .LBB7_8: ; %.return ; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1] -; GFX10-64-NEXT: v_mov_b32_e32 v0, 0x3c00 -; GFX10-64-NEXT: v_bfrev_b32_e32 v1, 60 -; GFX10-64-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX10-64-NEXT: v_bfrev_b32_e32 v0, 60 +; GFX10-64-NEXT: v_mov_b32_e32 v1, 0x3c00 +; GFX10-64-NEXT: exp mrt0 v1, v1, v0, v0 done compr vm ; GFX10-64-NEXT: s_endpgm ; GFX10-64-NEXT: .LBB7_9: ; GFX10-64-NEXT: s_mov_b64 exec, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll index f971080e02c5b..3280d7aa9ddfe 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll @@ -2375,8 +2375,8 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX950-NEXT: v_accvgpr_write_b32 a7, v47 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a8, v56 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a9, v57 ; Reload Reuse -; GFX950-NEXT: scratch_load_dword v33, off, s32 offset:8 -; GFX950-NEXT: scratch_load_dword v32, off, s32 offset:4 +; GFX950-NEXT: scratch_load_dword v35, off, s32 offset:8 +; GFX950-NEXT: scratch_load_dword v34, off, s32 offset:4 ; GFX950-NEXT: scratch_load_dword v37, off, s32 offset:16 ; GFX950-NEXT: scratch_load_dword v36, off, s32 offset:12 ; GFX950-NEXT: scratch_load_dword v39, off, s32 offset:24 @@ -2400,8 +2400,8 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX950-NEXT: scratch_load_dword v49, off, s32 offset:96 ; GFX950-NEXT: scratch_load_dword v48, off, s32 offset:92 ; GFX950-NEXT: scratch_load_dword v31, off, s32 -; GFX950-NEXT: scratch_load_dword v35, off, s32 offset:104 -; GFX950-NEXT: scratch_load_dword v34, off, s32 offset:100 +; GFX950-NEXT: scratch_load_dword v33, off, s32 offset:104 +; GFX950-NEXT: scratch_load_dword v32, off, s32 offset:100 ; GFX950-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a12, v60 ; Reload Reuse @@ -2409,10 +2409,10 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX950-NEXT: v_accvgpr_write_b32 a14, v62 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a15, v63 ; Reload Reuse ; GFX950-NEXT: s_waitcnt vmcnt(25) -; GFX950-NEXT: v_max_f64 v[58:59], v[0:1], v[32:33] -; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[32:33] -; GFX950-NEXT: scratch_load_dword v33, off, s32 offset:112 -; GFX950-NEXT: scratch_load_dword v32, off, s32 offset:108 +; GFX950-NEXT: v_max_f64 v[58:59], v[0:1], v[34:35] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[34:35] +; GFX950-NEXT: scratch_load_dword v35, off, s32 offset:112 +; GFX950-NEXT: scratch_load_dword v34, off, s32 offset:108 ; GFX950-NEXT: s_waitcnt vmcnt(25) ; GFX950-NEXT: v_max_f64 v[60:61], v[2:3], v[36:37] ; GFX950-NEXT: v_cmp_u_f64_e64 s[0:1], v[2:3], v[36:37] @@ -2486,16 +2486,16 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX950-NEXT: v_cndmask_b32_e64 v22, v0, 0, vcc ; GFX950-NEXT: v_cndmask_b32_e32 v23, v1, v2, vcc ; GFX950-NEXT: s_waitcnt vmcnt(6) -; GFX950-NEXT: v_max_f64 v[0:1], v[24:25], v[34:35] -; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[24:25], v[34:35] +; GFX950-NEXT: v_max_f64 v[0:1], v[24:25], v[32:33] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[24:25], v[32:33] ; GFX950-NEXT: v_accvgpr_read_b32 v42, a2 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_read_b32 v41, a1 ; Reload Reuse ; GFX950-NEXT: v_cndmask_b32_e64 v24, v0, 0, vcc ; GFX950-NEXT: v_cndmask_b32_e32 v25, v1, v2, vcc ; GFX950-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse ; GFX950-NEXT: s_waitcnt vmcnt(4) -; GFX950-NEXT: v_max_f64 v[0:1], v[26:27], v[32:33] -; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[26:27], v[32:33] +; GFX950-NEXT: v_max_f64 v[0:1], v[26:27], v[34:35] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[26:27], v[34:35] ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v26, v0, 0, vcc ; GFX950-NEXT: v_cndmask_b32_e32 v27, v1, v2, vcc diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll index dfd67873c3b86..d07bd6c8dd902 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll @@ -2375,8 +2375,8 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX950-NEXT: v_accvgpr_write_b32 a7, v47 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a8, v56 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a9, v57 ; Reload Reuse -; GFX950-NEXT: scratch_load_dword v33, off, s32 offset:8 -; GFX950-NEXT: scratch_load_dword v32, off, s32 offset:4 +; GFX950-NEXT: scratch_load_dword v35, off, s32 offset:8 +; GFX950-NEXT: scratch_load_dword v34, off, s32 offset:4 ; GFX950-NEXT: scratch_load_dword v37, off, s32 offset:16 ; GFX950-NEXT: scratch_load_dword v36, off, s32 offset:12 ; GFX950-NEXT: scratch_load_dword v39, off, s32 offset:24 @@ -2400,8 +2400,8 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX950-NEXT: scratch_load_dword v49, off, s32 offset:96 ; GFX950-NEXT: scratch_load_dword v48, off, s32 offset:92 ; GFX950-NEXT: scratch_load_dword v31, off, s32 -; GFX950-NEXT: scratch_load_dword v35, off, s32 offset:104 -; GFX950-NEXT: scratch_load_dword v34, off, s32 offset:100 +; GFX950-NEXT: scratch_load_dword v33, off, s32 offset:104 +; GFX950-NEXT: scratch_load_dword v32, off, s32 offset:100 ; GFX950-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a12, v60 ; Reload Reuse @@ -2409,10 +2409,10 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX950-NEXT: v_accvgpr_write_b32 a14, v62 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a15, v63 ; Reload Reuse ; GFX950-NEXT: s_waitcnt vmcnt(25) -; GFX950-NEXT: v_min_f64 v[58:59], v[0:1], v[32:33] -; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[32:33] -; GFX950-NEXT: scratch_load_dword v33, off, s32 offset:112 -; GFX950-NEXT: scratch_load_dword v32, off, s32 offset:108 +; GFX950-NEXT: v_min_f64 v[58:59], v[0:1], v[34:35] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[34:35] +; GFX950-NEXT: scratch_load_dword v35, off, s32 offset:112 +; GFX950-NEXT: scratch_load_dword v34, off, s32 offset:108 ; GFX950-NEXT: s_waitcnt vmcnt(25) ; GFX950-NEXT: v_min_f64 v[60:61], v[2:3], v[36:37] ; GFX950-NEXT: v_cmp_u_f64_e64 s[0:1], v[2:3], v[36:37] @@ -2486,16 +2486,16 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX950-NEXT: v_cndmask_b32_e64 v22, v0, 0, vcc ; GFX950-NEXT: v_cndmask_b32_e32 v23, v1, v2, vcc ; GFX950-NEXT: s_waitcnt vmcnt(6) -; GFX950-NEXT: v_min_f64 v[0:1], v[24:25], v[34:35] -; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[24:25], v[34:35] +; GFX950-NEXT: v_min_f64 v[0:1], v[24:25], v[32:33] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[24:25], v[32:33] ; GFX950-NEXT: v_accvgpr_read_b32 v42, a2 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_read_b32 v41, a1 ; Reload Reuse ; GFX950-NEXT: v_cndmask_b32_e64 v24, v0, 0, vcc ; GFX950-NEXT: v_cndmask_b32_e32 v25, v1, v2, vcc ; GFX950-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse ; GFX950-NEXT: s_waitcnt vmcnt(4) -; GFX950-NEXT: v_min_f64 v[0:1], v[26:27], v[32:33] -; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[26:27], v[32:33] +; GFX950-NEXT: v_min_f64 v[0:1], v[26:27], v[34:35] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[26:27], v[34:35] ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v26, v0, 0, vcc ; GFX950-NEXT: v_cndmask_b32_e32 v27, v1, v2, vcc diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll index c41c8a0b4ce2e..06a0514404fbc 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll @@ -187,7 +187,6 @@ define amdgpu_kernel void @private_agent_unordered_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -387,7 +386,6 @@ define amdgpu_kernel void @private_agent_monotonic_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -587,7 +585,6 @@ define amdgpu_kernel void @private_agent_acquire_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -788,7 +785,6 @@ define amdgpu_kernel void @private_agent_seq_cst_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -971,7 +967,6 @@ define amdgpu_kernel void @private_agent_unordered_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -1150,7 +1145,6 @@ define amdgpu_kernel void @private_agent_monotonic_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -1329,7 +1323,6 @@ define amdgpu_kernel void @private_agent_release_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -1511,7 +1504,6 @@ define amdgpu_kernel void @private_agent_seq_cst_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -1693,7 +1685,6 @@ define amdgpu_kernel void @private_agent_monotonic_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -1872,7 +1863,6 @@ define amdgpu_kernel void @private_agent_acquire_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -2053,7 +2043,6 @@ define amdgpu_kernel void @private_agent_release_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -2235,7 +2224,6 @@ define amdgpu_kernel void @private_agent_acq_rel_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -2419,7 +2407,6 @@ define amdgpu_kernel void @private_agent_seq_cst_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -2657,7 +2644,6 @@ define amdgpu_kernel void @private_agent_acquire_ret_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] @@ -2894,7 +2880,6 @@ define amdgpu_kernel void @private_agent_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] @@ -3134,7 +3119,6 @@ define amdgpu_kernel void @private_agent_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] @@ -3407,7 +3391,6 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -3677,7 +3660,6 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -3949,7 +3931,6 @@ define amdgpu_kernel void @private_agent_release_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -4222,7 +4203,6 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -4497,7 +4477,6 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -4772,7 +4751,6 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -5044,7 +5022,6 @@ define amdgpu_kernel void @private_agent_acquire_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -5316,7 +5293,6 @@ define amdgpu_kernel void @private_agent_release_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -5591,7 +5567,6 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -5866,7 +5841,6 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -6141,7 +6115,6 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -6416,7 +6389,6 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -6691,7 +6663,6 @@ define amdgpu_kernel void @private_agent_release_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -6966,7 +6937,6 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -7241,7 +7211,6 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -7544,7 +7513,6 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -7846,7 +7814,6 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -8149,7 +8116,6 @@ define amdgpu_kernel void @private_agent_release_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -8454,7 +8420,6 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -8760,7 +8725,6 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -9066,7 +9030,6 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -9369,7 +9332,6 @@ define amdgpu_kernel void @private_agent_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -9672,7 +9634,6 @@ define amdgpu_kernel void @private_agent_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -9978,7 +9939,6 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -10284,7 +10244,6 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -10590,7 +10549,6 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -10896,7 +10854,6 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -11202,7 +11159,6 @@ define amdgpu_kernel void @private_agent_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -11508,7 +11464,6 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -11814,7 +11769,6 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -12025,7 +11979,6 @@ define amdgpu_kernel void @private_agent_one_as_unordered_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -12225,7 +12178,6 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -12425,7 +12377,6 @@ define amdgpu_kernel void @private_agent_one_as_acquire_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -12627,7 +12578,6 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -12811,7 +12761,6 @@ define amdgpu_kernel void @private_agent_one_as_unordered_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -12990,7 +12939,6 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -13169,7 +13117,6 @@ define amdgpu_kernel void @private_agent_one_as_release_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -13351,7 +13298,6 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -13533,7 +13479,6 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -13712,7 +13657,6 @@ define amdgpu_kernel void @private_agent_one_as_acquire_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -13893,7 +13837,6 @@ define amdgpu_kernel void @private_agent_one_as_release_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -14075,7 +14018,6 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -14259,7 +14201,6 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -14497,7 +14438,6 @@ define amdgpu_kernel void @private_agent_one_as_acquire_ret_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] @@ -14735,7 +14675,6 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] @@ -14976,7 +14915,6 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] @@ -15250,7 +15188,6 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -15520,7 +15457,6 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -15792,7 +15728,6 @@ define amdgpu_kernel void @private_agent_one_as_release_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -16065,7 +16000,6 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -16340,7 +16274,6 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -16615,7 +16548,6 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -16887,7 +16819,6 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -17159,7 +17090,6 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -17434,7 +17364,6 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -17709,7 +17638,6 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -17984,7 +17912,6 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -18259,7 +18186,6 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -18534,7 +18460,6 @@ define amdgpu_kernel void @private_agent_one_as_release_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -18809,7 +18734,6 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -19084,7 +19008,6 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -19387,7 +19310,6 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -19689,7 +19611,6 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -19993,7 +19914,6 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -20300,7 +20220,6 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -20607,7 +20526,6 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -20911,7 +20829,6 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -21215,7 +21132,6 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -21522,7 +21438,6 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -21829,7 +21744,6 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -22136,7 +22050,6 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -22443,7 +22356,6 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -22750,7 +22662,6 @@ define amdgpu_kernel void @private_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -23057,7 +22968,6 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -23364,7 +23274,6 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-cluster.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-cluster.ll index 2af195461d2eb..8fcaeccbc0397 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-cluster.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-cluster.ll @@ -187,7 +187,6 @@ define amdgpu_kernel void @private_cluster_unordered_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -387,7 +386,6 @@ define amdgpu_kernel void @private_cluster_monotonic_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -587,7 +585,6 @@ define amdgpu_kernel void @private_cluster_acquire_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -788,7 +785,6 @@ define amdgpu_kernel void @private_cluster_seq_cst_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -971,7 +967,6 @@ define amdgpu_kernel void @private_cluster_unordered_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -1150,7 +1145,6 @@ define amdgpu_kernel void @private_cluster_monotonic_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -1329,7 +1323,6 @@ define amdgpu_kernel void @private_cluster_release_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -1510,7 +1503,6 @@ define amdgpu_kernel void @private_cluster_seq_cst_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -1691,7 +1683,6 @@ define amdgpu_kernel void @private_cluster_monotonic_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -1870,7 +1861,6 @@ define amdgpu_kernel void @private_cluster_acquire_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -2051,7 +2041,6 @@ define amdgpu_kernel void @private_cluster_release_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -2232,7 +2221,6 @@ define amdgpu_kernel void @private_cluster_acq_rel_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -2415,7 +2403,6 @@ define amdgpu_kernel void @private_cluster_seq_cst_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -2652,7 +2639,6 @@ define amdgpu_kernel void @private_cluster_acquire_ret_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] @@ -2889,7 +2875,6 @@ define amdgpu_kernel void @private_cluster_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] @@ -3128,7 +3113,6 @@ define amdgpu_kernel void @private_cluster_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] @@ -3400,7 +3384,6 @@ define amdgpu_kernel void @private_cluster_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -3670,7 +3653,6 @@ define amdgpu_kernel void @private_cluster_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -3942,7 +3924,6 @@ define amdgpu_kernel void @private_cluster_release_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -4214,7 +4195,6 @@ define amdgpu_kernel void @private_cluster_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -4488,7 +4468,6 @@ define amdgpu_kernel void @private_cluster_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -4762,7 +4741,6 @@ define amdgpu_kernel void @private_cluster_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -5034,7 +5012,6 @@ define amdgpu_kernel void @private_cluster_acquire_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -5306,7 +5283,6 @@ define amdgpu_kernel void @private_cluster_release_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -5580,7 +5556,6 @@ define amdgpu_kernel void @private_cluster_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -5854,7 +5829,6 @@ define amdgpu_kernel void @private_cluster_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -6128,7 +6102,6 @@ define amdgpu_kernel void @private_cluster_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -6402,7 +6375,6 @@ define amdgpu_kernel void @private_cluster_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -6676,7 +6648,6 @@ define amdgpu_kernel void @private_cluster_release_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -6950,7 +6921,6 @@ define amdgpu_kernel void @private_cluster_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -7224,7 +7194,6 @@ define amdgpu_kernel void @private_cluster_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -7526,7 +7495,6 @@ define amdgpu_kernel void @private_cluster_monotonic_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -7828,7 +7796,6 @@ define amdgpu_kernel void @private_cluster_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -8131,7 +8098,6 @@ define amdgpu_kernel void @private_cluster_release_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -8435,7 +8401,6 @@ define amdgpu_kernel void @private_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -8740,7 +8705,6 @@ define amdgpu_kernel void @private_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -9045,7 +9009,6 @@ define amdgpu_kernel void @private_cluster_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -9348,7 +9311,6 @@ define amdgpu_kernel void @private_cluster_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -9651,7 +9613,6 @@ define amdgpu_kernel void @private_cluster_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -9956,7 +9917,6 @@ define amdgpu_kernel void @private_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -10261,7 +10221,6 @@ define amdgpu_kernel void @private_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -10566,7 +10525,6 @@ define amdgpu_kernel void @private_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -10871,7 +10829,6 @@ define amdgpu_kernel void @private_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -11176,7 +11133,6 @@ define amdgpu_kernel void @private_cluster_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -11481,7 +11437,6 @@ define amdgpu_kernel void @private_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -11786,7 +11741,6 @@ define amdgpu_kernel void @private_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -11996,7 +11950,6 @@ define amdgpu_kernel void @private_cluster_one_as_unordered_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -12196,7 +12149,6 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -12396,7 +12348,6 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -12598,7 +12549,6 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -12782,7 +12732,6 @@ define amdgpu_kernel void @private_cluster_one_as_unordered_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -12961,7 +12910,6 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -13140,7 +13088,6 @@ define amdgpu_kernel void @private_cluster_one_as_release_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -13321,7 +13268,6 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -13502,7 +13448,6 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -13681,7 +13626,6 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -13862,7 +13806,6 @@ define amdgpu_kernel void @private_cluster_one_as_release_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -14043,7 +13986,6 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -14226,7 +14168,6 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -14463,7 +14404,6 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_ret_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] @@ -14701,7 +14641,6 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] @@ -14941,7 +14880,6 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] @@ -15214,7 +15152,6 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -15484,7 +15421,6 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -15756,7 +15692,6 @@ define amdgpu_kernel void @private_cluster_one_as_release_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -16028,7 +15963,6 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -16302,7 +16236,6 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -16576,7 +16509,6 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -16848,7 +16780,6 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -17120,7 +17051,6 @@ define amdgpu_kernel void @private_cluster_one_as_release_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -17394,7 +17324,6 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -17668,7 +17597,6 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -17942,7 +17870,6 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -18216,7 +18143,6 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -18490,7 +18416,6 @@ define amdgpu_kernel void @private_cluster_one_as_release_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -18764,7 +18689,6 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -19038,7 +18962,6 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -19340,7 +19263,6 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_monotonic_ret_cmpxch ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -19642,7 +19564,6 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -19946,7 +19867,6 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -20252,7 +20172,6 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -20558,7 +20477,6 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -20862,7 +20780,6 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -21166,7 +21083,6 @@ define amdgpu_kernel void @private_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -21472,7 +21388,6 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -21778,7 +21693,6 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -22084,7 +21998,6 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -22390,7 +22303,6 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -22696,7 +22608,6 @@ define amdgpu_kernel void @private_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -23002,7 +22913,6 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -23308,7 +23218,6 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll index 17ed7b7776aa9..f9189c8419898 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll @@ -187,7 +187,6 @@ define amdgpu_kernel void @private_singlethread_unordered_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -387,7 +386,6 @@ define amdgpu_kernel void @private_singlethread_monotonic_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -587,7 +585,6 @@ define amdgpu_kernel void @private_singlethread_acquire_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -787,7 +784,6 @@ define amdgpu_kernel void @private_singlethread_seq_cst_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -967,7 +963,6 @@ define amdgpu_kernel void @private_singlethread_unordered_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -1146,7 +1141,6 @@ define amdgpu_kernel void @private_singlethread_monotonic_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -1325,7 +1319,6 @@ define amdgpu_kernel void @private_singlethread_release_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -1504,7 +1497,6 @@ define amdgpu_kernel void @private_singlethread_seq_cst_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -1683,7 +1675,6 @@ define amdgpu_kernel void @private_singlethread_monotonic_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -1862,7 +1853,6 @@ define amdgpu_kernel void @private_singlethread_acquire_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -2041,7 +2031,6 @@ define amdgpu_kernel void @private_singlethread_release_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -2220,7 +2209,6 @@ define amdgpu_kernel void @private_singlethread_acq_rel_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -2399,7 +2387,6 @@ define amdgpu_kernel void @private_singlethread_seq_cst_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -2632,7 +2619,6 @@ define amdgpu_kernel void @private_singlethread_acquire_ret_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] @@ -2868,7 +2854,6 @@ define amdgpu_kernel void @private_singlethread_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] @@ -3104,7 +3089,6 @@ define amdgpu_kernel void @private_singlethread_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] @@ -3373,7 +3357,6 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -3643,7 +3626,6 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -3913,7 +3895,6 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -4183,7 +4164,6 @@ define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -4453,7 +4433,6 @@ define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -4723,7 +4702,6 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -4993,7 +4971,6 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -5263,7 +5240,6 @@ define amdgpu_kernel void @private_singlethread_release_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -5533,7 +5509,6 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -5803,7 +5778,6 @@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -6073,7 +6047,6 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -6343,7 +6316,6 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -6613,7 +6585,6 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -6883,7 +6854,6 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -7153,7 +7123,6 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -7451,7 +7420,6 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -7753,7 +7721,6 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -8055,7 +8022,6 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -8357,7 +8323,6 @@ define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -8659,7 +8624,6 @@ define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -8961,7 +8925,6 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -9263,7 +9226,6 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -9565,7 +9527,6 @@ define amdgpu_kernel void @private_singlethread_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -9867,7 +9828,6 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -10169,7 +10129,6 @@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -10471,7 +10430,6 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -10773,7 +10731,6 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -11075,7 +11032,6 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -11377,7 +11333,6 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -11679,7 +11634,6 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -11886,7 +11840,6 @@ define amdgpu_kernel void @private_singlethread_one_as_unordered_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -12086,7 +12039,6 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -12286,7 +12238,6 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -12486,7 +12437,6 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -12666,7 +12616,6 @@ define amdgpu_kernel void @private_singlethread_one_as_unordered_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -12845,7 +12794,6 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -13024,7 +12972,6 @@ define amdgpu_kernel void @private_singlethread_one_as_release_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -13203,7 +13150,6 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -13382,7 +13328,6 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -13561,7 +13506,6 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -13740,7 +13684,6 @@ define amdgpu_kernel void @private_singlethread_one_as_release_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -13919,7 +13862,6 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -14098,7 +14040,6 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -14331,7 +14272,6 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_ret_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] @@ -14567,7 +14507,6 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] @@ -14803,7 +14742,6 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] @@ -15072,7 +15010,6 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_cmpxc ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -15342,7 +15279,6 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_cmpxchg ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -15612,7 +15548,6 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_cmpxchg ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -15882,7 +15817,6 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_cmpxchg ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -16152,7 +16086,6 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_cmpxchg ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -16422,7 +16355,6 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_cmpxchg ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -16692,7 +16624,6 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -16962,7 +16893,6 @@ define amdgpu_kernel void @private_singlethread_one_as_release_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -17232,7 +17162,6 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -17502,7 +17431,6 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -17772,7 +17700,6 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_cmpxchg ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -18042,7 +17969,6 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -18312,7 +18238,6 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -18582,7 +18507,6 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -18852,7 +18776,6 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -19150,7 +19073,6 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_ret_c ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -19452,7 +19374,6 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_ret_cmp ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -19754,7 +19675,6 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_ret_cmp ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -20056,7 +19976,6 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_ret_cmp ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -20358,7 +20277,6 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_ret_cmp ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -20660,7 +20578,6 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_ret_cmp ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -20962,7 +20879,6 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_ret_cmpxc ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -21264,7 +21180,6 @@ define amdgpu_kernel void @private_singlethread_one_as_release_acquire_ret_cmpxc ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -21566,7 +21481,6 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_ret_cmpxc ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -21868,7 +21782,6 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_ret_cmpxc ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -22170,7 +22083,6 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_ret_cmp ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -22472,7 +22384,6 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_ret_cmpxc ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -22774,7 +22685,6 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_ret_cmpxc ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -23076,7 +22986,6 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxc ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -23378,7 +23287,6 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxc ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll index 3702f983dbbb4..e4708f544d721 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll @@ -187,7 +187,6 @@ define amdgpu_kernel void @private_system_unordered_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -387,7 +386,6 @@ define amdgpu_kernel void @private_system_monotonic_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -587,7 +585,6 @@ define amdgpu_kernel void @private_system_acquire_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -788,7 +785,6 @@ define amdgpu_kernel void @private_system_seq_cst_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -971,7 +967,6 @@ define amdgpu_kernel void @private_system_unordered_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -1150,7 +1145,6 @@ define amdgpu_kernel void @private_system_monotonic_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -1329,7 +1323,6 @@ define amdgpu_kernel void @private_system_release_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -1511,7 +1504,6 @@ define amdgpu_kernel void @private_system_seq_cst_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -1693,7 +1685,6 @@ define amdgpu_kernel void @private_system_monotonic_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -1872,7 +1863,6 @@ define amdgpu_kernel void @private_system_acquire_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -2053,7 +2043,6 @@ define amdgpu_kernel void @private_system_release_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -2235,7 +2224,6 @@ define amdgpu_kernel void @private_system_acq_rel_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -2419,7 +2407,6 @@ define amdgpu_kernel void @private_system_seq_cst_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -2657,7 +2644,6 @@ define amdgpu_kernel void @private_system_acquire_ret_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] @@ -2894,7 +2880,6 @@ define amdgpu_kernel void @private_system_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] @@ -3134,7 +3119,6 @@ define amdgpu_kernel void @private_system_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] @@ -3407,7 +3391,6 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -3677,7 +3660,6 @@ define amdgpu_kernel void @private_system_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -3949,7 +3931,6 @@ define amdgpu_kernel void @private_system_release_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -4222,7 +4203,6 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -4497,7 +4477,6 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -4772,7 +4751,6 @@ define amdgpu_kernel void @private_system_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -5044,7 +5022,6 @@ define amdgpu_kernel void @private_system_acquire_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -5316,7 +5293,6 @@ define amdgpu_kernel void @private_system_release_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -5591,7 +5567,6 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -5866,7 +5841,6 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -6141,7 +6115,6 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -6444,7 +6417,6 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -6746,7 +6718,6 @@ define amdgpu_kernel void @private_system_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -7049,7 +7020,6 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -7355,7 +7325,6 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -7661,7 +7630,6 @@ define amdgpu_kernel void @private_system_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -7964,7 +7932,6 @@ define amdgpu_kernel void @private_system_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -8267,7 +8234,6 @@ define amdgpu_kernel void @private_system_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -8573,7 +8539,6 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -8879,7 +8844,6 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -9185,7 +9149,6 @@ define amdgpu_kernel void @private_system_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -9491,7 +9454,6 @@ define amdgpu_kernel void @private_system_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -9797,7 +9759,6 @@ define amdgpu_kernel void @private_system_relese_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -10103,7 +10064,6 @@ define amdgpu_kernel void @private_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -10409,7 +10369,6 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -10620,7 +10579,6 @@ define amdgpu_kernel void @private_system_one_as_unordered_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -10820,7 +10778,6 @@ define amdgpu_kernel void @private_system_one_as_monotonic_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -11020,7 +10977,6 @@ define amdgpu_kernel void @private_system_one_as_acquire_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -11222,7 +11178,6 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -11406,7 +11361,6 @@ define amdgpu_kernel void @private_system_one_as_unordered_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -11585,7 +11539,6 @@ define amdgpu_kernel void @private_system_one_as_monotonic_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -11764,7 +11717,6 @@ define amdgpu_kernel void @private_system_one_as_release_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -11946,7 +11898,6 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -12128,7 +12079,6 @@ define amdgpu_kernel void @private_system_one_as_monotonic_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -12307,7 +12257,6 @@ define amdgpu_kernel void @private_system_one_as_acquire_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -12488,7 +12437,6 @@ define amdgpu_kernel void @private_system_one_as_release_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -12670,7 +12618,6 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -12854,7 +12801,6 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -13092,7 +13038,6 @@ define amdgpu_kernel void @private_system_one_as_acquire_ret_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] @@ -13330,7 +13275,6 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] @@ -13571,7 +13515,6 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] @@ -13845,7 +13788,6 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -14115,7 +14057,6 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -14387,7 +14328,6 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -14660,7 +14600,6 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -14935,7 +14874,6 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -15210,7 +15148,6 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -15482,7 +15419,6 @@ define amdgpu_kernel void @private_system_one_as_acquire_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -15754,7 +15690,6 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -16029,7 +15964,6 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -16304,7 +16238,6 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -16579,7 +16512,6 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -16854,7 +16786,6 @@ define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -17129,7 +17060,6 @@ define amdgpu_kernel void @private_system_one_as_release_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -17404,7 +17334,6 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -17679,7 +17608,6 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -17982,7 +17910,6 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_ret_cmpxchg ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -18284,7 +18211,6 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -18588,7 +18514,6 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -18893,7 +18818,6 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -19200,7 +19124,6 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -19507,7 +19430,6 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -19811,7 +19733,6 @@ define amdgpu_kernel void @private_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -20115,7 +20036,6 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -20422,7 +20342,6 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -20729,7 +20648,6 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -21036,7 +20954,6 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -21343,7 +21260,6 @@ define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -21650,7 +21566,6 @@ define amdgpu_kernel void @private_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -21957,7 +21872,6 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -22264,7 +22178,6 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll index b868d8a86de4b..d4c562a149b9a 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll @@ -187,7 +187,6 @@ define amdgpu_kernel void @private_wavefront_unordered_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -387,7 +386,6 @@ define amdgpu_kernel void @private_wavefront_monotonic_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -587,7 +585,6 @@ define amdgpu_kernel void @private_wavefront_acquire_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -787,7 +784,6 @@ define amdgpu_kernel void @private_wavefront_seq_cst_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -967,7 +963,6 @@ define amdgpu_kernel void @private_wavefront_unordered_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -1146,7 +1141,6 @@ define amdgpu_kernel void @private_wavefront_monotonic_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -1325,7 +1319,6 @@ define amdgpu_kernel void @private_wavefront_release_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -1504,7 +1497,6 @@ define amdgpu_kernel void @private_wavefront_seq_cst_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -1683,7 +1675,6 @@ define amdgpu_kernel void @private_wavefront_monotonic_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -1862,7 +1853,6 @@ define amdgpu_kernel void @private_wavefront_acquire_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -2041,7 +2031,6 @@ define amdgpu_kernel void @private_wavefront_release_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -2220,7 +2209,6 @@ define amdgpu_kernel void @private_wavefront_acq_rel_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -2399,7 +2387,6 @@ define amdgpu_kernel void @private_wavefront_seq_cst_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -2632,7 +2619,6 @@ define amdgpu_kernel void @private_wavefront_acquire_ret_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] @@ -2868,7 +2854,6 @@ define amdgpu_kernel void @private_wavefront_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] @@ -3104,7 +3089,6 @@ define amdgpu_kernel void @private_wavefront_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] @@ -3373,7 +3357,6 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -3643,7 +3626,6 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -3913,7 +3895,6 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -4183,7 +4164,6 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -4453,7 +4433,6 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -4723,7 +4702,6 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -4993,7 +4971,6 @@ define amdgpu_kernel void @private_wavefront_acquire_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -5263,7 +5240,6 @@ define amdgpu_kernel void @private_wavefront_release_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -5533,7 +5509,6 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -5803,7 +5778,6 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -6073,7 +6047,6 @@ define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -6343,7 +6316,6 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -6613,7 +6585,6 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -6883,7 +6854,6 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -7153,7 +7123,6 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -7451,7 +7420,6 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -7753,7 +7721,6 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -8055,7 +8022,6 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -8357,7 +8323,6 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -8659,7 +8624,6 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -8961,7 +8925,6 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -9263,7 +9226,6 @@ define amdgpu_kernel void @private_wavefront_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -9565,7 +9527,6 @@ define amdgpu_kernel void @private_wavefront_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -9867,7 +9828,6 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -10169,7 +10129,6 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -10471,7 +10430,6 @@ define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -10773,7 +10731,6 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -11075,7 +11032,6 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -11377,7 +11333,6 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -11679,7 +11634,6 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -11886,7 +11840,6 @@ define amdgpu_kernel void @private_wavefront_one_as_unordered_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -12086,7 +12039,6 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -12286,7 +12238,6 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -12486,7 +12437,6 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -12666,7 +12616,6 @@ define amdgpu_kernel void @private_wavefront_one_as_unordered_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -12845,7 +12794,6 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -13024,7 +12972,6 @@ define amdgpu_kernel void @private_wavefront_one_as_release_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -13203,7 +13150,6 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -13382,7 +13328,6 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -13561,7 +13506,6 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -13740,7 +13684,6 @@ define amdgpu_kernel void @private_wavefront_one_as_release_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -13919,7 +13862,6 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -14098,7 +14040,6 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -14331,7 +14272,6 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_ret_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] @@ -14567,7 +14507,6 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] @@ -14803,7 +14742,6 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] @@ -15072,7 +15010,6 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -15342,7 +15279,6 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -15612,7 +15548,6 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -15882,7 +15817,6 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -16152,7 +16086,6 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -16422,7 +16355,6 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -16692,7 +16624,6 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -16962,7 +16893,6 @@ define amdgpu_kernel void @private_wavefront_one_as_release_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -17232,7 +17162,6 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -17502,7 +17431,6 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -17772,7 +17700,6 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -18042,7 +17969,6 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -18312,7 +18238,6 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -18582,7 +18507,6 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -18852,7 +18776,6 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -19150,7 +19073,6 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_ret_cmpx ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -19452,7 +19374,6 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_ret_cmpxch ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -19754,7 +19675,6 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_ret_cmpxch ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -20056,7 +19976,6 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_ret_cmpxch ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -20358,7 +20277,6 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_ret_cmpxch ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -20660,7 +20578,6 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_ret_cmpxch ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -20962,7 +20879,6 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -21264,7 +21180,6 @@ define amdgpu_kernel void @private_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -21566,7 +21481,6 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -21868,7 +21782,6 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -22170,7 +22083,6 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_ret_cmpxch ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -22472,7 +22384,6 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -22774,7 +22685,6 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -23076,7 +22986,6 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -23378,7 +23287,6 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll index 08388d1b95aef..53a8a0a1b694e 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll @@ -187,7 +187,6 @@ define amdgpu_kernel void @private_workgroup_unordered_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -387,7 +386,6 @@ define amdgpu_kernel void @private_workgroup_monotonic_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -587,7 +585,6 @@ define amdgpu_kernel void @private_workgroup_acquire_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -787,7 +784,6 @@ define amdgpu_kernel void @private_workgroup_seq_cst_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -969,7 +965,6 @@ define amdgpu_kernel void @private_workgroup_unordered_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -1148,7 +1143,6 @@ define amdgpu_kernel void @private_workgroup_monotonic_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -1327,7 +1321,6 @@ define amdgpu_kernel void @private_workgroup_release_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -1508,7 +1501,6 @@ define amdgpu_kernel void @private_workgroup_seq_cst_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -1689,7 +1681,6 @@ define amdgpu_kernel void @private_workgroup_monotonic_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -1868,7 +1859,6 @@ define amdgpu_kernel void @private_workgroup_acquire_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -2048,7 +2038,6 @@ define amdgpu_kernel void @private_workgroup_release_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -2229,7 +2218,6 @@ define amdgpu_kernel void @private_workgroup_acq_rel_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -2411,7 +2399,6 @@ define amdgpu_kernel void @private_workgroup_seq_cst_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -2647,7 +2634,6 @@ define amdgpu_kernel void @private_workgroup_acquire_ret_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] @@ -2883,7 +2869,6 @@ define amdgpu_kernel void @private_workgroup_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] @@ -3121,7 +3106,6 @@ define amdgpu_kernel void @private_workgroup_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] @@ -3392,7 +3376,6 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -3662,7 +3645,6 @@ define amdgpu_kernel void @private_workgroup_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -3933,7 +3915,6 @@ define amdgpu_kernel void @private_workgroup_release_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -4205,7 +4186,6 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -4478,7 +4458,6 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -4751,7 +4730,6 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -5022,7 +5000,6 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -5293,7 +5270,6 @@ define amdgpu_kernel void @private_workgroup_release_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -5566,7 +5542,6 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -5839,7 +5814,6 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -6112,7 +6086,6 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -6385,7 +6358,6 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -6658,7 +6630,6 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -6931,7 +6902,6 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -7204,7 +7174,6 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -7505,7 +7474,6 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -7807,7 +7775,6 @@ define amdgpu_kernel void @private_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -8109,7 +8076,6 @@ define amdgpu_kernel void @private_workgroup_release_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -8413,7 +8379,6 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -8717,7 +8682,6 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -9021,7 +8985,6 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -9323,7 +9286,6 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -9625,7 +9587,6 @@ define amdgpu_kernel void @private_workgroup_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -9929,7 +9890,6 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -10233,7 +10193,6 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -10537,7 +10496,6 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -10841,7 +10799,6 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -11145,7 +11102,6 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -11449,7 +11405,6 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -11753,7 +11708,6 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -11962,7 +11916,6 @@ define amdgpu_kernel void @private_workgroup_one_as_unordered_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -12162,7 +12115,6 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -12362,7 +12314,6 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -12562,7 +12513,6 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_load( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -12744,7 +12694,6 @@ define amdgpu_kernel void @private_workgroup_one_as_unordered_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -12923,7 +12872,6 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -13102,7 +13050,6 @@ define amdgpu_kernel void @private_workgroup_one_as_release_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -13283,7 +13230,6 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_store( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -13464,7 +13410,6 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -13643,7 +13588,6 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -13823,7 +13767,6 @@ define amdgpu_kernel void @private_workgroup_one_as_release_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -14004,7 +13947,6 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -14186,7 +14128,6 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -14422,7 +14363,6 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_ret_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] @@ -14658,7 +14598,6 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] @@ -14896,7 +14835,6 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] @@ -15167,7 +15105,6 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -15437,7 +15374,6 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -15708,7 +15644,6 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -15980,7 +15915,6 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -16253,7 +16187,6 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -16526,7 +16459,6 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -16797,7 +16729,6 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -17068,7 +16999,6 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -17341,7 +17271,6 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -17614,7 +17543,6 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -17887,7 +17815,6 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -18160,7 +18087,6 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -18433,7 +18359,6 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -18706,7 +18631,6 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -18979,7 +18903,6 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s2, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s3 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] @@ -19280,7 +19203,6 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_ret_cmpx ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -19582,7 +19504,6 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_ret_cmpxch ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -19884,7 +19805,6 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_ret_cmpxch ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -20188,7 +20108,6 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_ret_cmpxch ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -20492,7 +20411,6 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_ret_cmpxch ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -20796,7 +20714,6 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_ret_cmpxch ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -21098,7 +21015,6 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -21400,7 +21316,6 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -21704,7 +21619,6 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -22008,7 +21922,6 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -22312,7 +22225,6 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_ret_cmpxch ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -22616,7 +22528,6 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -22920,7 +22831,6 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -23224,7 +23134,6 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] @@ -23528,7 +23437,6 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_mov_b32 s3, 20 ; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo ; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll index 331a29b3f4a93..85bf05f39c684 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll @@ -101,41 +101,42 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 { ; ; GFX90A-LABEL: test_mfma_loop_zeroinit: ; GFX90A: ; %bb.0: ; %entry +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_mov_b32 s0, 16 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v2 ; GFX90A-NEXT: .LBB0_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_nop 1 @@ -160,41 +161,42 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 { ; ; GFX942-LABEL: test_mfma_loop_zeroinit: ; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_accvgpr_write_b32 a0, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a3, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a4, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a5, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a6, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a7, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a8, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a9, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a10, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a11, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a12, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a13, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a14, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a15, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a16, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a18, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a19, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a20, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a21, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a22, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a23, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a24, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a25, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a26, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a27, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a28, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a29, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a30, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a31, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a1, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a3, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a4, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a5, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a6, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a7, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a8, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a9, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a10, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a11, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a12, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a13, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a14, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a15, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a16, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a17, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a18, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a19, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a20, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a21, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a22, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a23, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a24, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a25, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a26, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a27, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a28, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a29, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a30, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a31, v2 ; GFX942-NEXT: .LBB0_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_nop 1 @@ -332,42 +334,42 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg ; ; GFX90A-LABEL: test_mfma_loop_unfoldable_splat: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42f60000 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0x42f60000 ; GFX90A-NEXT: s_mov_b32 s0, 16 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v2 ; GFX90A-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_nop 1 @@ -392,42 +394,42 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg ; ; GFX942-LABEL: test_mfma_loop_unfoldable_splat: ; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: v_mov_b32_e32 v0, 0x42f60000 +; GFX942-NEXT: v_mov_b32_e32 v2, 0x42f60000 ; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a30, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a28, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a27, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a23, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a19, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a11, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a8, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a4, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a3, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a1, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a3, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a4, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a5, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a6, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a7, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a8, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a9, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a10, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a11, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a12, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a13, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a14, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a15, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a16, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a17, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a18, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a19, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a20, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a21, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a22, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a23, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a24, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a25, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a26, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a27, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a28, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a29, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a30, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a31, v2 ; GFX942-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_nop 1 @@ -559,41 +561,42 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 { ; ; GFX90A-LABEL: test_mfma_loop_non_splat: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: s_mov_b32 s0, 16 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, 1.0 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_mov_b32 s0, 16 ; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v2 ; GFX90A-NEXT: .LBB2_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_nop 1 @@ -618,41 +621,42 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 { ; ; GFX942-LABEL: test_mfma_loop_non_splat: ; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_accvgpr_write_b32 a0, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, 1.0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a3, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a4, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a5, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a6, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a7, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a8, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a9, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a10, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a11, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a12, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a13, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a14, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a15, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a16, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a18, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a19, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a20, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a21, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a22, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a23, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a24, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a25, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a26, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a27, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a28, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a29, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a30, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a31, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: s_mov_b32 s0, 16 ; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX942-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a3, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a4, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a5, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a6, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a7, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a8, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a9, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a10, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a11, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a12, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a13, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a14, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a15, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a16, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a17, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a18, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a19, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a20, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a21, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a22, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a23, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a24, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a25, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a26, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a27, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a28, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a29, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a30, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a31, v2 ; GFX942-NEXT: .LBB2_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_nop 1 @@ -821,77 +825,77 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg) ; ; GFX90A-LABEL: test_mfma_loop_unfoldable_seq: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42f60000 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42f80000 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42fa0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42fc0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42fe0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43000000 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43010000 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43020000 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43030000 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43040000 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43050000 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43060000 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43070000 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43080000 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43090000 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430a0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430b0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430c0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430d0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430e0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430f0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43100000 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43110000 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43120000 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43130000 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43140000 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43150000 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43160000 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43170000 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43180000 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43190000 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x431a0000 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x43190000 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0x43180000 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0x43170000 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0x43160000 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x43150000 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0x43140000 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0x43130000 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0x43120000 +; GFX90A-NEXT: v_mov_b32_e32 v9, 0x43110000 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0x43100000 +; GFX90A-NEXT: v_mov_b32_e32 v11, 0x430f0000 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0x430e0000 +; GFX90A-NEXT: v_mov_b32_e32 v13, 0x430d0000 +; GFX90A-NEXT: v_mov_b32_e32 v14, 0x430c0000 +; GFX90A-NEXT: v_mov_b32_e32 v15, 0x430b0000 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0x430a0000 +; GFX90A-NEXT: v_mov_b32_e32 v17, 0x43090000 +; GFX90A-NEXT: v_mov_b32_e32 v18, 0x43080000 +; GFX90A-NEXT: v_mov_b32_e32 v19, 0x43070000 +; GFX90A-NEXT: v_mov_b32_e32 v20, 0x43060000 +; GFX90A-NEXT: v_mov_b32_e32 v21, 0x43050000 +; GFX90A-NEXT: v_mov_b32_e32 v22, 0x43040000 +; GFX90A-NEXT: v_mov_b32_e32 v23, 0x43030000 +; GFX90A-NEXT: v_mov_b32_e32 v24, 0x43020000 +; GFX90A-NEXT: v_mov_b32_e32 v25, 0x43010000 +; GFX90A-NEXT: v_mov_b32_e32 v26, 0x43000000 +; GFX90A-NEXT: v_mov_b32_e32 v27, 0x42fe0000 +; GFX90A-NEXT: v_mov_b32_e32 v28, 0x42fc0000 +; GFX90A-NEXT: v_mov_b32_e32 v29, 0x42fa0000 +; GFX90A-NEXT: v_mov_b32_e32 v30, 0x42f80000 +; GFX90A-NEXT: v_mov_b32_e32 v31, 0x42f60000 ; GFX90A-NEXT: s_mov_b32 s0, 16 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v31 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v30 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v29 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v28 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v27 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v26 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v25 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v24 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v23 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v22 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v21 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v20 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v19 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v18 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v17 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v16 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v15 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v14 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v13 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v12 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v11 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v10 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v9 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v8 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v7 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v6 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v5 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v4 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v1 ; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX90A-NEXT: .LBB3_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; GFX90A-NEXT: s_add_i32 s0, s0, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 ; GFX90A-NEXT: s_cbranch_scc1 .LBB3_1 @@ -912,77 +916,77 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg) ; ; GFX942-LABEL: test_mfma_loop_unfoldable_seq: ; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: v_mov_b32_e32 v0, 0x42f60000 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x42f80000 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x42fa0000 -; GFX942-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x42fc0000 -; GFX942-NEXT: v_accvgpr_write_b32 a3, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x42fe0000 -; GFX942-NEXT: v_accvgpr_write_b32 a4, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43000000 -; GFX942-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43010000 -; GFX942-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43020000 -; GFX942-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43030000 -; GFX942-NEXT: v_accvgpr_write_b32 a8, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43040000 -; GFX942-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43050000 -; GFX942-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43060000 -; GFX942-NEXT: v_accvgpr_write_b32 a11, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43070000 -; GFX942-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43080000 -; GFX942-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43090000 -; GFX942-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x430a0000 -; GFX942-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x430b0000 -; GFX942-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x430c0000 -; GFX942-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x430d0000 -; GFX942-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x430e0000 -; GFX942-NEXT: v_accvgpr_write_b32 a19, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x430f0000 -; GFX942-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43100000 -; GFX942-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43110000 -; GFX942-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43120000 -; GFX942-NEXT: v_accvgpr_write_b32 a23, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43130000 -; GFX942-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43140000 -; GFX942-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43150000 -; GFX942-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43160000 -; GFX942-NEXT: v_accvgpr_write_b32 a27, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43170000 -; GFX942-NEXT: v_accvgpr_write_b32 a28, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43180000 -; GFX942-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43190000 -; GFX942-NEXT: v_accvgpr_write_b32 a30, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x431a0000 +; GFX942-NEXT: v_mov_b32_e32 v1, 0x43190000 +; GFX942-NEXT: v_mov_b32_e32 v2, 0x43180000 +; GFX942-NEXT: v_mov_b32_e32 v3, 0x43170000 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x43160000 +; GFX942-NEXT: v_mov_b32_e32 v5, 0x43150000 +; GFX942-NEXT: v_mov_b32_e32 v6, 0x43140000 +; GFX942-NEXT: v_mov_b32_e32 v7, 0x43130000 +; GFX942-NEXT: v_mov_b32_e32 v8, 0x43120000 +; GFX942-NEXT: v_mov_b32_e32 v9, 0x43110000 +; GFX942-NEXT: v_mov_b32_e32 v10, 0x43100000 +; GFX942-NEXT: v_mov_b32_e32 v11, 0x430f0000 +; GFX942-NEXT: v_mov_b32_e32 v12, 0x430e0000 +; GFX942-NEXT: v_mov_b32_e32 v13, 0x430d0000 +; GFX942-NEXT: v_mov_b32_e32 v14, 0x430c0000 +; GFX942-NEXT: v_mov_b32_e32 v15, 0x430b0000 +; GFX942-NEXT: v_mov_b32_e32 v16, 0x430a0000 +; GFX942-NEXT: v_mov_b32_e32 v17, 0x43090000 +; GFX942-NEXT: v_mov_b32_e32 v18, 0x43080000 +; GFX942-NEXT: v_mov_b32_e32 v19, 0x43070000 +; GFX942-NEXT: v_mov_b32_e32 v20, 0x43060000 +; GFX942-NEXT: v_mov_b32_e32 v21, 0x43050000 +; GFX942-NEXT: v_mov_b32_e32 v22, 0x43040000 +; GFX942-NEXT: v_mov_b32_e32 v23, 0x43030000 +; GFX942-NEXT: v_mov_b32_e32 v24, 0x43020000 +; GFX942-NEXT: v_mov_b32_e32 v25, 0x43010000 +; GFX942-NEXT: v_mov_b32_e32 v26, 0x43000000 +; GFX942-NEXT: v_mov_b32_e32 v27, 0x42fe0000 +; GFX942-NEXT: v_mov_b32_e32 v28, 0x42fc0000 +; GFX942-NEXT: v_mov_b32_e32 v29, 0x42fa0000 +; GFX942-NEXT: v_mov_b32_e32 v30, 0x42f80000 +; GFX942-NEXT: v_mov_b32_e32 v31, 0x42f60000 ; GFX942-NEXT: s_mov_b32 s0, 16 +; GFX942-NEXT: v_accvgpr_write_b32 a0, v31 +; GFX942-NEXT: v_accvgpr_write_b32 a1, v30 +; GFX942-NEXT: v_accvgpr_write_b32 a2, v29 +; GFX942-NEXT: v_accvgpr_write_b32 a3, v28 +; GFX942-NEXT: v_accvgpr_write_b32 a4, v27 +; GFX942-NEXT: v_accvgpr_write_b32 a5, v26 +; GFX942-NEXT: v_accvgpr_write_b32 a6, v25 +; GFX942-NEXT: v_accvgpr_write_b32 a7, v24 +; GFX942-NEXT: v_accvgpr_write_b32 a8, v23 +; GFX942-NEXT: v_accvgpr_write_b32 a9, v22 +; GFX942-NEXT: v_accvgpr_write_b32 a10, v21 +; GFX942-NEXT: v_accvgpr_write_b32 a11, v20 +; GFX942-NEXT: v_accvgpr_write_b32 a12, v19 +; GFX942-NEXT: v_accvgpr_write_b32 a13, v18 +; GFX942-NEXT: v_accvgpr_write_b32 a14, v17 +; GFX942-NEXT: v_accvgpr_write_b32 a15, v16 +; GFX942-NEXT: v_accvgpr_write_b32 a16, v15 +; GFX942-NEXT: v_accvgpr_write_b32 a17, v14 +; GFX942-NEXT: v_accvgpr_write_b32 a18, v13 +; GFX942-NEXT: v_accvgpr_write_b32 a19, v12 +; GFX942-NEXT: v_accvgpr_write_b32 a20, v11 +; GFX942-NEXT: v_accvgpr_write_b32 a21, v10 +; GFX942-NEXT: v_accvgpr_write_b32 a22, v9 +; GFX942-NEXT: v_accvgpr_write_b32 a23, v8 +; GFX942-NEXT: v_accvgpr_write_b32 a24, v7 +; GFX942-NEXT: v_accvgpr_write_b32 a25, v6 +; GFX942-NEXT: v_accvgpr_write_b32 a26, v5 +; GFX942-NEXT: v_accvgpr_write_b32 a27, v4 +; GFX942-NEXT: v_accvgpr_write_b32 a28, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a29, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a30, v1 ; GFX942-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX942-NEXT: .LBB3_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31] +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] ; GFX942-NEXT: s_add_i32 s0, s0, -1 ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; GFX942-NEXT: s_cbranch_scc1 .LBB3_1 @@ -1110,42 +1114,42 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 { ; ; GFX90A-LABEL: test_mfma_loop_vgpr_init: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX90A-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX90A-NEXT: s_mov_b32 s0, 16 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v2 ; GFX90A-NEXT: .LBB4_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_nop 1 @@ -1170,42 +1174,42 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 { ; ; GFX942-LABEL: test_mfma_loop_vgpr_init: ; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a3, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a4, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a8, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a11, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a19, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a23, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a27, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a28, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a30, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a31, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a1, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a3, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a4, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a5, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a6, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a7, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a8, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a9, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a10, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a11, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a12, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a13, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a14, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a15, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a16, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a17, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a18, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a19, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a20, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a21, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a22, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a23, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a24, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a25, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a26, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a27, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a28, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a29, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a30, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a31, v2 ; GFX942-NEXT: .LBB4_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_nop 1 @@ -1378,42 +1382,41 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: s_load_dword s1, s[4:5], 0x2c ; GFX90A-NEXT: s_mov_b32 s0, 16 +; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, s1 ; GFX90A-NEXT: .LBB5_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_nop 1 @@ -1440,42 +1443,41 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dword s1, s[4:5], 0x2c ; GFX942-NEXT: s_mov_b32 s0, 16 +; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v0, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a30, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a28, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a27, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a23, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a19, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a11, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a8, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a4, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a3, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 +; GFX942-NEXT: v_accvgpr_write_b32 a0, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a2, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a3, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a4, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a5, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a6, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a7, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a8, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a9, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a10, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a11, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a12, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a13, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a14, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a15, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a16, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a17, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a18, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a19, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a20, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a21, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a22, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a23, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a24, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a25, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a26, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a27, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a28, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a29, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a30, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a31, s1 ; GFX942-NEXT: .LBB5_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_nop 1 @@ -1644,44 +1646,44 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; GFX90A-LABEL: test_mfma_loop_mixed_init: ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: s_load_dword s1, s[4:5], 0x2c -; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: s_mov_b32 s0, 16 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v3 ; GFX90A-NEXT: .LBB6_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_nop 1 @@ -1707,44 +1709,44 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; GFX942-LABEL: test_mfma_loop_mixed_init: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dword s1, s[4:5], 0x2c -; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX942-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v0, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a3, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a4, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a5, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a6, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a7, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a8, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a9, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a10, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a11, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a12, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a13, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a14, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a15, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a16, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a18, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a19, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a20, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a21, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a22, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a23, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a24, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a25, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a26, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a27, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a28, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a29, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a30, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a31, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a2, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a4, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a5, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a6, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a7, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a8, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a9, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a10, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a11, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a12, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a13, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a14, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a15, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a16, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a17, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a18, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a19, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a20, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a21, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a22, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a23, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a24, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a25, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a26, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a27, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a28, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a29, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a30, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a31, v3 ; GFX942-NEXT: .LBB6_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_nop 1 @@ -2094,152 +2096,57 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, 0 ; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 2 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v7, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v8, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v9, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v10, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v11, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v12, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v13, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v14, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v15, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v16, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v17, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v18, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v19, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v20, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v21, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v22, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v23, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v24, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v25, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v26, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v27, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v28, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v29, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v30, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v31, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v32, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a8, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a9, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a10, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a11, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a12, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a13, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a14, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a15, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a16, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a17, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a18, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a19, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a20, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a21, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a22, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a23, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a24, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a25, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a26, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a27, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a28, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a29, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a30, a0 +; GFX90A-NEXT: v_accvgpr_mov_b32 a31, a0 ; GFX90A-NEXT: .LBB8_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, v32 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v31 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v30 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v29 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v28 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v27 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v26 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v25 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v24 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v23 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v22 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v21 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v20 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v19 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v18 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v17 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v16 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v15 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v14 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v13 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v12 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v11 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v10 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v9 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v8 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v7 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v6 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v5 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, v4 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v2 -; GFX90A-NEXT: v_accvgpr_mov_b32 a33, a31 -; GFX90A-NEXT: v_accvgpr_mov_b32 a32, a30 -; GFX90A-NEXT: v_accvgpr_mov_b32 a31, a29 -; GFX90A-NEXT: v_accvgpr_mov_b32 a30, a28 -; GFX90A-NEXT: v_accvgpr_mov_b32 a29, a27 -; GFX90A-NEXT: v_accvgpr_mov_b32 a28, a26 -; GFX90A-NEXT: v_accvgpr_mov_b32 a27, a25 -; GFX90A-NEXT: v_accvgpr_mov_b32 a26, a24 -; GFX90A-NEXT: v_accvgpr_mov_b32 a25, a23 -; GFX90A-NEXT: v_accvgpr_mov_b32 a24, a22 -; GFX90A-NEXT: v_accvgpr_mov_b32 a23, a21 -; GFX90A-NEXT: v_accvgpr_mov_b32 a22, a20 -; GFX90A-NEXT: v_accvgpr_mov_b32 a21, a19 -; GFX90A-NEXT: v_accvgpr_mov_b32 a20, a18 -; GFX90A-NEXT: v_accvgpr_mov_b32 a19, a17 -; GFX90A-NEXT: v_accvgpr_mov_b32 a18, a16 -; GFX90A-NEXT: v_accvgpr_mov_b32 a17, a15 -; GFX90A-NEXT: v_accvgpr_mov_b32 a16, a14 -; GFX90A-NEXT: v_accvgpr_mov_b32 a15, a13 -; GFX90A-NEXT: v_accvgpr_mov_b32 a14, a12 -; GFX90A-NEXT: v_accvgpr_mov_b32 a13, a11 -; GFX90A-NEXT: v_accvgpr_mov_b32 a12, a10 -; GFX90A-NEXT: v_accvgpr_mov_b32 a11, a9 -; GFX90A-NEXT: v_accvgpr_mov_b32 a10, a8 -; GFX90A-NEXT: v_accvgpr_mov_b32 a9, a7 -; GFX90A-NEXT: v_accvgpr_mov_b32 a8, a6 -; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a5 -; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a4 -; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a3 -; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a2 -; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a1 -; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0 +; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; GFX90A-NEXT: s_add_i32 s0, s0, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[2:33], v0, v1, a[2:33] -; GFX90A-NEXT: s_nop 15 -; GFX90A-NEXT: s_nop 2 -; GFX90A-NEXT: v_accvgpr_mov_b32 a0, a2 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a3 -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a4 -; GFX90A-NEXT: v_accvgpr_read_b32 v4, a5 -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a6 -; GFX90A-NEXT: v_accvgpr_read_b32 v6, a7 -; GFX90A-NEXT: v_accvgpr_read_b32 v7, a8 -; GFX90A-NEXT: v_accvgpr_read_b32 v8, a9 -; GFX90A-NEXT: v_accvgpr_read_b32 v9, a10 -; GFX90A-NEXT: v_accvgpr_read_b32 v10, a11 -; GFX90A-NEXT: v_accvgpr_read_b32 v11, a12 -; GFX90A-NEXT: v_accvgpr_read_b32 v12, a13 -; GFX90A-NEXT: v_accvgpr_read_b32 v13, a14 -; GFX90A-NEXT: v_accvgpr_read_b32 v14, a15 -; GFX90A-NEXT: v_accvgpr_read_b32 v15, a16 -; GFX90A-NEXT: v_accvgpr_read_b32 v16, a17 -; GFX90A-NEXT: v_accvgpr_read_b32 v17, a18 -; GFX90A-NEXT: v_accvgpr_read_b32 v18, a19 -; GFX90A-NEXT: v_accvgpr_read_b32 v19, a20 -; GFX90A-NEXT: v_accvgpr_read_b32 v20, a21 -; GFX90A-NEXT: v_accvgpr_read_b32 v21, a22 -; GFX90A-NEXT: v_accvgpr_read_b32 v22, a23 -; GFX90A-NEXT: v_accvgpr_read_b32 v23, a24 -; GFX90A-NEXT: v_accvgpr_read_b32 v24, a25 -; GFX90A-NEXT: v_accvgpr_read_b32 v25, a26 -; GFX90A-NEXT: v_accvgpr_read_b32 v26, a27 -; GFX90A-NEXT: v_accvgpr_read_b32 v27, a28 -; GFX90A-NEXT: v_accvgpr_read_b32 v28, a29 -; GFX90A-NEXT: v_accvgpr_read_b32 v29, a30 -; GFX90A-NEXT: v_accvgpr_read_b32 v30, a31 -; GFX90A-NEXT: v_accvgpr_read_b32 v31, a32 -; GFX90A-NEXT: v_accvgpr_read_b32 v32, a33 ; GFX90A-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %exit ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: global_store_dwordx4 v0, a[30:33], s[0:1] offset:112 -; GFX90A-NEXT: global_store_dwordx4 v0, a[26:29], s[0:1] offset:96 -; GFX90A-NEXT: global_store_dwordx4 v0, a[22:25], s[0:1] offset:80 -; GFX90A-NEXT: global_store_dwordx4 v0, a[18:21], s[0:1] offset:64 -; GFX90A-NEXT: global_store_dwordx4 v0, a[14:17], s[0:1] offset:48 -; GFX90A-NEXT: global_store_dwordx4 v0, a[10:13], s[0:1] offset:32 -; GFX90A-NEXT: global_store_dwordx4 v0, a[6:9], s[0:1] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v0, a[2:5], s[0:1] +; GFX90A-NEXT: s_nop 12 +; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 +; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 +; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 +; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 +; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_loop_agpr_init: @@ -2251,152 +2158,57 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 { ; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, 0 ; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX942-NEXT: v_accvgpr_read_b32 v3, a0 -; GFX942-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX942-NEXT: v_accvgpr_read_b32 v5, a0 -; GFX942-NEXT: v_accvgpr_read_b32 v6, a0 -; GFX942-NEXT: v_accvgpr_read_b32 v7, a0 -; GFX942-NEXT: v_accvgpr_read_b32 v8, a0 -; GFX942-NEXT: v_accvgpr_read_b32 v9, a0 -; GFX942-NEXT: v_accvgpr_read_b32 v10, a0 -; GFX942-NEXT: v_accvgpr_read_b32 v11, a0 -; GFX942-NEXT: v_accvgpr_read_b32 v12, a0 -; GFX942-NEXT: v_accvgpr_read_b32 v13, a0 -; GFX942-NEXT: v_accvgpr_read_b32 v14, a0 -; GFX942-NEXT: v_accvgpr_read_b32 v15, a0 -; GFX942-NEXT: v_accvgpr_read_b32 v16, a0 -; GFX942-NEXT: v_accvgpr_read_b32 v17, a0 -; GFX942-NEXT: v_accvgpr_read_b32 v18, a0 -; GFX942-NEXT: v_accvgpr_read_b32 v19, a0 -; GFX942-NEXT: v_accvgpr_read_b32 v20, a0 -; GFX942-NEXT: v_accvgpr_read_b32 v21, a0 -; GFX942-NEXT: v_accvgpr_read_b32 v22, a0 -; GFX942-NEXT: v_accvgpr_read_b32 v23, a0 -; GFX942-NEXT: v_accvgpr_read_b32 v24, a0 -; GFX942-NEXT: v_accvgpr_read_b32 v25, a0 -; GFX942-NEXT: v_accvgpr_read_b32 v26, a0 -; GFX942-NEXT: v_accvgpr_read_b32 v27, a0 -; GFX942-NEXT: v_accvgpr_read_b32 v28, a0 -; GFX942-NEXT: v_accvgpr_read_b32 v29, a0 -; GFX942-NEXT: v_accvgpr_read_b32 v30, a0 -; GFX942-NEXT: v_accvgpr_read_b32 v31, a0 -; GFX942-NEXT: v_accvgpr_read_b32 v32, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a5, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a7, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a8, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a9, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a10, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a11, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a12, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a13, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a14, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a15, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a16, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a17, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a18, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a19, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a20, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a21, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a22, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a23, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a24, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a25, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a26, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a27, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a28, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a29, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a30, a0 +; GFX942-NEXT: v_accvgpr_mov_b32 a31, a0 ; GFX942-NEXT: .LBB8_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: v_accvgpr_write_b32 a31, v32 -; GFX942-NEXT: v_accvgpr_write_b32 a30, v31 -; GFX942-NEXT: v_accvgpr_write_b32 a29, v30 -; GFX942-NEXT: v_accvgpr_write_b32 a28, v29 -; GFX942-NEXT: v_accvgpr_write_b32 a27, v28 -; GFX942-NEXT: v_accvgpr_write_b32 a26, v27 -; GFX942-NEXT: v_accvgpr_write_b32 a25, v26 -; GFX942-NEXT: v_accvgpr_write_b32 a24, v25 -; GFX942-NEXT: v_accvgpr_write_b32 a23, v24 -; GFX942-NEXT: v_accvgpr_write_b32 a22, v23 -; GFX942-NEXT: v_accvgpr_write_b32 a21, v22 -; GFX942-NEXT: v_accvgpr_write_b32 a20, v21 -; GFX942-NEXT: v_accvgpr_write_b32 a19, v20 -; GFX942-NEXT: v_accvgpr_write_b32 a18, v19 -; GFX942-NEXT: v_accvgpr_write_b32 a17, v18 -; GFX942-NEXT: v_accvgpr_write_b32 a16, v17 -; GFX942-NEXT: v_accvgpr_write_b32 a15, v16 -; GFX942-NEXT: v_accvgpr_write_b32 a14, v15 -; GFX942-NEXT: v_accvgpr_write_b32 a13, v14 -; GFX942-NEXT: v_accvgpr_write_b32 a12, v13 -; GFX942-NEXT: v_accvgpr_write_b32 a11, v12 -; GFX942-NEXT: v_accvgpr_write_b32 a10, v11 -; GFX942-NEXT: v_accvgpr_write_b32 a9, v10 -; GFX942-NEXT: v_accvgpr_write_b32 a8, v9 -; GFX942-NEXT: v_accvgpr_write_b32 a7, v8 -; GFX942-NEXT: v_accvgpr_write_b32 a6, v7 -; GFX942-NEXT: v_accvgpr_write_b32 a5, v6 -; GFX942-NEXT: v_accvgpr_write_b32 a4, v5 -; GFX942-NEXT: v_accvgpr_write_b32 a3, v4 -; GFX942-NEXT: v_accvgpr_write_b32 a2, v3 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v2 -; GFX942-NEXT: v_accvgpr_mov_b32 a33, a31 -; GFX942-NEXT: v_accvgpr_mov_b32 a32, a30 -; GFX942-NEXT: v_accvgpr_mov_b32 a31, a29 -; GFX942-NEXT: v_accvgpr_mov_b32 a30, a28 -; GFX942-NEXT: v_accvgpr_mov_b32 a29, a27 -; GFX942-NEXT: v_accvgpr_mov_b32 a28, a26 -; GFX942-NEXT: v_accvgpr_mov_b32 a27, a25 -; GFX942-NEXT: v_accvgpr_mov_b32 a26, a24 -; GFX942-NEXT: v_accvgpr_mov_b32 a25, a23 -; GFX942-NEXT: v_accvgpr_mov_b32 a24, a22 -; GFX942-NEXT: v_accvgpr_mov_b32 a23, a21 -; GFX942-NEXT: v_accvgpr_mov_b32 a22, a20 -; GFX942-NEXT: v_accvgpr_mov_b32 a21, a19 -; GFX942-NEXT: v_accvgpr_mov_b32 a20, a18 -; GFX942-NEXT: v_accvgpr_mov_b32 a19, a17 -; GFX942-NEXT: v_accvgpr_mov_b32 a18, a16 -; GFX942-NEXT: v_accvgpr_mov_b32 a17, a15 -; GFX942-NEXT: v_accvgpr_mov_b32 a16, a14 -; GFX942-NEXT: v_accvgpr_mov_b32 a15, a13 -; GFX942-NEXT: v_accvgpr_mov_b32 a14, a12 -; GFX942-NEXT: v_accvgpr_mov_b32 a13, a11 -; GFX942-NEXT: v_accvgpr_mov_b32 a12, a10 -; GFX942-NEXT: v_accvgpr_mov_b32 a11, a9 -; GFX942-NEXT: v_accvgpr_mov_b32 a10, a8 -; GFX942-NEXT: v_accvgpr_mov_b32 a9, a7 -; GFX942-NEXT: v_accvgpr_mov_b32 a8, a6 -; GFX942-NEXT: v_accvgpr_mov_b32 a7, a5 -; GFX942-NEXT: v_accvgpr_mov_b32 a6, a4 -; GFX942-NEXT: v_accvgpr_mov_b32 a5, a3 -; GFX942-NEXT: v_accvgpr_mov_b32 a4, a2 -; GFX942-NEXT: v_accvgpr_mov_b32 a3, a1 -; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] ; GFX942-NEXT: s_add_i32 s0, s0, -1 ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[2:33], v0, v1, a[2:33] -; GFX942-NEXT: s_nop 15 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_accvgpr_mov_b32 a0, a2 -; GFX942-NEXT: v_accvgpr_read_b32 v2, a3 -; GFX942-NEXT: v_accvgpr_read_b32 v3, a4 -; GFX942-NEXT: v_accvgpr_read_b32 v4, a5 -; GFX942-NEXT: v_accvgpr_read_b32 v5, a6 -; GFX942-NEXT: v_accvgpr_read_b32 v6, a7 -; GFX942-NEXT: v_accvgpr_read_b32 v7, a8 -; GFX942-NEXT: v_accvgpr_read_b32 v8, a9 -; GFX942-NEXT: v_accvgpr_read_b32 v9, a10 -; GFX942-NEXT: v_accvgpr_read_b32 v10, a11 -; GFX942-NEXT: v_accvgpr_read_b32 v11, a12 -; GFX942-NEXT: v_accvgpr_read_b32 v12, a13 -; GFX942-NEXT: v_accvgpr_read_b32 v13, a14 -; GFX942-NEXT: v_accvgpr_read_b32 v14, a15 -; GFX942-NEXT: v_accvgpr_read_b32 v15, a16 -; GFX942-NEXT: v_accvgpr_read_b32 v16, a17 -; GFX942-NEXT: v_accvgpr_read_b32 v17, a18 -; GFX942-NEXT: v_accvgpr_read_b32 v18, a19 -; GFX942-NEXT: v_accvgpr_read_b32 v19, a20 -; GFX942-NEXT: v_accvgpr_read_b32 v20, a21 -; GFX942-NEXT: v_accvgpr_read_b32 v21, a22 -; GFX942-NEXT: v_accvgpr_read_b32 v22, a23 -; GFX942-NEXT: v_accvgpr_read_b32 v23, a24 -; GFX942-NEXT: v_accvgpr_read_b32 v24, a25 -; GFX942-NEXT: v_accvgpr_read_b32 v25, a26 -; GFX942-NEXT: v_accvgpr_read_b32 v26, a27 -; GFX942-NEXT: v_accvgpr_read_b32 v27, a28 -; GFX942-NEXT: v_accvgpr_read_b32 v28, a29 -; GFX942-NEXT: v_accvgpr_read_b32 v29, a30 -; GFX942-NEXT: v_accvgpr_read_b32 v30, a31 -; GFX942-NEXT: v_accvgpr_read_b32 v31, a32 -; GFX942-NEXT: v_accvgpr_read_b32 v32, a33 ; GFX942-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX942-NEXT: ; %bb.2: ; %exit ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: global_store_dwordx4 v0, a[30:33], s[0:1] offset:112 -; GFX942-NEXT: global_store_dwordx4 v0, a[26:29], s[0:1] offset:96 -; GFX942-NEXT: global_store_dwordx4 v0, a[22:25], s[0:1] offset:80 -; GFX942-NEXT: global_store_dwordx4 v0, a[18:21], s[0:1] offset:64 -; GFX942-NEXT: global_store_dwordx4 v0, a[14:17], s[0:1] offset:48 -; GFX942-NEXT: global_store_dwordx4 v0, a[10:13], s[0:1] offset:32 -; GFX942-NEXT: global_store_dwordx4 v0, a[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v0, a[2:5], s[0:1] +; GFX942-NEXT: s_nop 11 +; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 +; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 +; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 +; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 +; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX942-NEXT: s_endpgm entry: %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0) @@ -2803,41 +2615,42 @@ define <32 x float> @test_mfma_loop_zeroinit_ret_use() #0 { ; GFX90A-LABEL: test_mfma_loop_zeroinit_ret_use: ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_mov_b32 s4, 16 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v2 ; GFX90A-NEXT: .LBB10_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_nop 1 @@ -2884,41 +2697,42 @@ define <32 x float> @test_mfma_loop_zeroinit_ret_use() #0 { ; GFX942-LABEL: test_mfma_loop_zeroinit_ret_use: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_accvgpr_write_b32 a0, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a3, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a4, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a5, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a6, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a7, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a8, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a9, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a10, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a11, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a12, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a13, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a14, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a15, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a16, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a18, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a19, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a20, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a21, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a22, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a23, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a24, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a25, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a26, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a27, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a28, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a29, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a30, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a31, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a1, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a3, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a4, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a5, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a6, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a7, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a8, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a9, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a10, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a11, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a12, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a13, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a14, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a15, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a16, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a17, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a18, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a19, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a20, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a21, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a22, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a23, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a24, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a25, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a26, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a27, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a28, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a29, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a30, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a31, v2 ; GFX942-NEXT: .LBB10_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_nop 1 @@ -3061,41 +2875,42 @@ define <32 x float> @test_mfma_loop_non_splat_ret_use() #0 { ; GFX90A-LABEL: test_mfma_loop_non_splat_ret_use: ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s4, 16 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, 1.0 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_mov_b32 s4, 16 ; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v2 ; GFX90A-NEXT: .LBB11_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_nop 1 @@ -3142,41 +2957,42 @@ define <32 x float> @test_mfma_loop_non_splat_ret_use() #0 { ; GFX942-LABEL: test_mfma_loop_non_splat_ret_use: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_accvgpr_write_b32 a0, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, 1.0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a3, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a4, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a5, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a6, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a7, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a8, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a9, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a10, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a11, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a12, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a13, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a14, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a15, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a16, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a18, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a19, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a20, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a21, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a22, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a23, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a24, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a25, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a26, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a27, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a28, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a29, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a30, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a31, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: s_mov_b32 s0, 16 ; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX942-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a3, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a4, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a5, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a6, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a7, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a8, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a9, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a10, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a11, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a12, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a13, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a14, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a15, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a16, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a17, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a18, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a19, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a20, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a21, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a22, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a23, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a24, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a25, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a26, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a27, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a28, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a29, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a30, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a31, v2 ; GFX942-NEXT: .LBB11_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_nop 1 diff --git a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll index f46116ef752c9..122d69c20c49e 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll @@ -499,105 +499,73 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; FAST90A-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x0 ; FAST90A-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x40 ; FAST90A-NEXT: s_waitcnt lgkmcnt(0) -; FAST90A-NEXT: v_accvgpr_write_b32 a32, s36 -; FAST90A-NEXT: v_accvgpr_write_b32 a33, s37 -; FAST90A-NEXT: v_accvgpr_write_b32 a34, s38 -; FAST90A-NEXT: v_accvgpr_write_b32 a35, s39 -; FAST90A-NEXT: v_accvgpr_write_b32 a36, s40 -; FAST90A-NEXT: v_accvgpr_write_b32 a37, s41 -; FAST90A-NEXT: v_accvgpr_write_b32 a38, s42 -; FAST90A-NEXT: v_accvgpr_write_b32 a39, s43 -; FAST90A-NEXT: v_accvgpr_write_b32 a40, s44 -; FAST90A-NEXT: v_accvgpr_write_b32 a41, s45 -; FAST90A-NEXT: v_accvgpr_write_b32 a42, s46 -; FAST90A-NEXT: v_accvgpr_write_b32 a43, s47 -; FAST90A-NEXT: v_accvgpr_write_b32 a44, s48 -; FAST90A-NEXT: v_accvgpr_write_b32 a45, s49 -; FAST90A-NEXT: v_accvgpr_write_b32 a46, s50 -; FAST90A-NEXT: v_accvgpr_write_b32 a47, s51 -; FAST90A-NEXT: v_accvgpr_write_b32 a48, s4 -; FAST90A-NEXT: v_accvgpr_write_b32 a49, s5 -; FAST90A-NEXT: v_accvgpr_write_b32 a50, s6 -; FAST90A-NEXT: v_accvgpr_write_b32 a51, s7 -; FAST90A-NEXT: v_accvgpr_write_b32 a52, s8 -; FAST90A-NEXT: v_accvgpr_write_b32 a53, s9 -; FAST90A-NEXT: v_accvgpr_write_b32 a54, s10 -; FAST90A-NEXT: v_accvgpr_write_b32 a55, s11 -; FAST90A-NEXT: v_accvgpr_write_b32 a56, s12 -; FAST90A-NEXT: v_accvgpr_write_b32 a57, s13 -; FAST90A-NEXT: v_accvgpr_write_b32 a58, s14 -; FAST90A-NEXT: v_accvgpr_write_b32 a59, s15 -; FAST90A-NEXT: v_accvgpr_write_b32 a60, s16 -; FAST90A-NEXT: v_accvgpr_write_b32 a61, s17 -; FAST90A-NEXT: v_accvgpr_write_b32 a62, s18 -; FAST90A-NEXT: v_accvgpr_write_b32 a63, s19 +; FAST90A-NEXT: v_accvgpr_write_b32 a0, s36 +; FAST90A-NEXT: v_accvgpr_write_b32 a1, s37 +; FAST90A-NEXT: v_accvgpr_write_b32 a2, s38 +; FAST90A-NEXT: v_accvgpr_write_b32 a3, s39 +; FAST90A-NEXT: v_accvgpr_write_b32 a4, s40 +; FAST90A-NEXT: v_accvgpr_write_b32 a5, s41 +; FAST90A-NEXT: v_accvgpr_write_b32 a6, s42 +; FAST90A-NEXT: v_accvgpr_write_b32 a7, s43 +; FAST90A-NEXT: v_accvgpr_write_b32 a8, s44 +; FAST90A-NEXT: v_accvgpr_write_b32 a9, s45 +; FAST90A-NEXT: v_accvgpr_write_b32 a10, s46 +; FAST90A-NEXT: v_accvgpr_write_b32 a11, s47 +; FAST90A-NEXT: v_accvgpr_write_b32 a12, s48 +; FAST90A-NEXT: v_accvgpr_write_b32 a13, s49 +; FAST90A-NEXT: v_accvgpr_write_b32 a14, s50 +; FAST90A-NEXT: v_accvgpr_write_b32 a15, s51 +; FAST90A-NEXT: v_accvgpr_write_b32 a16, s4 +; FAST90A-NEXT: v_accvgpr_write_b32 a17, s5 +; FAST90A-NEXT: v_accvgpr_write_b32 a18, s6 +; FAST90A-NEXT: v_accvgpr_write_b32 a19, s7 +; FAST90A-NEXT: v_accvgpr_write_b32 a20, s8 +; FAST90A-NEXT: v_accvgpr_write_b32 a21, s9 +; FAST90A-NEXT: v_accvgpr_write_b32 a22, s10 +; FAST90A-NEXT: v_accvgpr_write_b32 a23, s11 +; FAST90A-NEXT: v_accvgpr_write_b32 a24, s12 +; FAST90A-NEXT: v_accvgpr_write_b32 a25, s13 +; FAST90A-NEXT: v_accvgpr_write_b32 a26, s14 +; FAST90A-NEXT: v_accvgpr_write_b32 a27, s15 +; FAST90A-NEXT: v_accvgpr_write_b32 a28, s16 +; FAST90A-NEXT: v_accvgpr_write_b32 a29, s17 +; FAST90A-NEXT: v_accvgpr_write_b32 a30, s18 +; FAST90A-NEXT: v_accvgpr_write_b32 a31, s19 ; FAST90A-NEXT: s_nop 1 -; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63] -; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[32:63] +; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[0:31] ; FAST90A-NEXT: s_nop 15 ; FAST90A-NEXT: s_nop 2 -; FAST90A-NEXT: v_accvgpr_read_b32 v3, a29 -; FAST90A-NEXT: v_accvgpr_read_b32 v4, a28 -; FAST90A-NEXT: v_accvgpr_read_b32 v5, a27 -; FAST90A-NEXT: v_accvgpr_read_b32 v6, a26 -; FAST90A-NEXT: v_accvgpr_read_b32 v7, a25 -; FAST90A-NEXT: v_accvgpr_read_b32 v8, a24 -; FAST90A-NEXT: v_accvgpr_read_b32 v9, a23 -; FAST90A-NEXT: v_accvgpr_read_b32 v10, a22 -; FAST90A-NEXT: v_accvgpr_read_b32 v11, a21 -; FAST90A-NEXT: v_accvgpr_read_b32 v12, a20 -; FAST90A-NEXT: v_accvgpr_read_b32 v13, a19 -; FAST90A-NEXT: v_accvgpr_read_b32 v14, a18 -; FAST90A-NEXT: v_accvgpr_read_b32 v15, a17 -; FAST90A-NEXT: v_accvgpr_read_b32 v16, a16 -; FAST90A-NEXT: v_accvgpr_read_b32 v17, a15 -; FAST90A-NEXT: v_accvgpr_read_b32 v18, a14 -; FAST90A-NEXT: v_accvgpr_read_b32 v19, a13 -; FAST90A-NEXT: v_accvgpr_read_b32 v20, a12 -; FAST90A-NEXT: v_accvgpr_read_b32 v21, a11 -; FAST90A-NEXT: v_accvgpr_read_b32 v22, a10 -; FAST90A-NEXT: v_accvgpr_read_b32 v23, a9 -; FAST90A-NEXT: v_accvgpr_read_b32 v24, a8 -; FAST90A-NEXT: v_accvgpr_read_b32 v25, a7 -; FAST90A-NEXT: v_accvgpr_read_b32 v26, a6 -; FAST90A-NEXT: v_accvgpr_read_b32 v27, a5 -; FAST90A-NEXT: v_accvgpr_read_b32 v28, a4 -; FAST90A-NEXT: v_accvgpr_read_b32 v29, a3 -; FAST90A-NEXT: v_accvgpr_read_b32 v30, a2 -; FAST90A-NEXT: v_accvgpr_read_b32 v31, a1 -; FAST90A-NEXT: v_accvgpr_read_b32 v32, a0 -; FAST90A-NEXT: v_accvgpr_mov_b32 a0, a32 -; FAST90A-NEXT: v_accvgpr_mov_b32 a1, a33 -; FAST90A-NEXT: v_accvgpr_write_b32 a2, v32 -; FAST90A-NEXT: v_accvgpr_write_b32 a3, v31 -; FAST90A-NEXT: v_accvgpr_write_b32 a4, v30 -; FAST90A-NEXT: v_accvgpr_write_b32 a5, v29 -; FAST90A-NEXT: v_accvgpr_write_b32 a6, v28 -; FAST90A-NEXT: v_accvgpr_write_b32 a7, v27 -; FAST90A-NEXT: v_accvgpr_write_b32 a8, v26 -; FAST90A-NEXT: v_accvgpr_write_b32 a9, v25 -; FAST90A-NEXT: v_accvgpr_write_b32 a10, v24 -; FAST90A-NEXT: v_accvgpr_write_b32 a11, v23 -; FAST90A-NEXT: v_accvgpr_write_b32 a12, v22 -; FAST90A-NEXT: v_accvgpr_write_b32 a13, v21 -; FAST90A-NEXT: v_accvgpr_write_b32 a14, v20 -; FAST90A-NEXT: v_accvgpr_write_b32 a15, v19 -; FAST90A-NEXT: v_accvgpr_write_b32 a16, v18 -; FAST90A-NEXT: v_accvgpr_write_b32 a17, v17 -; FAST90A-NEXT: v_accvgpr_write_b32 a18, v16 -; FAST90A-NEXT: v_accvgpr_write_b32 a19, v15 -; FAST90A-NEXT: v_accvgpr_write_b32 a20, v14 -; FAST90A-NEXT: v_accvgpr_write_b32 a21, v13 -; FAST90A-NEXT: v_accvgpr_write_b32 a22, v12 -; FAST90A-NEXT: v_accvgpr_write_b32 a23, v11 -; FAST90A-NEXT: v_accvgpr_write_b32 a24, v10 -; FAST90A-NEXT: v_accvgpr_write_b32 a25, v9 -; FAST90A-NEXT: v_accvgpr_write_b32 a26, v8 -; FAST90A-NEXT: v_accvgpr_write_b32 a27, v7 -; FAST90A-NEXT: v_accvgpr_write_b32 a28, v6 -; FAST90A-NEXT: v_accvgpr_write_b32 a29, v5 -; FAST90A-NEXT: v_accvgpr_write_b32 a30, v4 -; FAST90A-NEXT: v_accvgpr_write_b32 a31, v3 +; FAST90A-NEXT: v_accvgpr_mov_b32 a2, a32 +; FAST90A-NEXT: v_accvgpr_mov_b32 a3, a33 +; FAST90A-NEXT: v_accvgpr_mov_b32 a4, a34 +; FAST90A-NEXT: v_accvgpr_mov_b32 a5, a35 +; FAST90A-NEXT: v_accvgpr_mov_b32 a6, a36 +; FAST90A-NEXT: v_accvgpr_mov_b32 a7, a37 +; FAST90A-NEXT: v_accvgpr_mov_b32 a8, a38 +; FAST90A-NEXT: v_accvgpr_mov_b32 a9, a39 +; FAST90A-NEXT: v_accvgpr_mov_b32 a10, a40 +; FAST90A-NEXT: v_accvgpr_mov_b32 a11, a41 +; FAST90A-NEXT: v_accvgpr_mov_b32 a12, a42 +; FAST90A-NEXT: v_accvgpr_mov_b32 a13, a43 +; FAST90A-NEXT: v_accvgpr_mov_b32 a14, a44 +; FAST90A-NEXT: v_accvgpr_mov_b32 a15, a45 +; FAST90A-NEXT: v_accvgpr_mov_b32 a16, a46 +; FAST90A-NEXT: v_accvgpr_mov_b32 a17, a47 +; FAST90A-NEXT: v_accvgpr_mov_b32 a18, a48 +; FAST90A-NEXT: v_accvgpr_mov_b32 a19, a49 +; FAST90A-NEXT: v_accvgpr_mov_b32 a20, a50 +; FAST90A-NEXT: v_accvgpr_mov_b32 a21, a51 +; FAST90A-NEXT: v_accvgpr_mov_b32 a22, a52 +; FAST90A-NEXT: v_accvgpr_mov_b32 a23, a53 +; FAST90A-NEXT: v_accvgpr_mov_b32 a24, a54 +; FAST90A-NEXT: v_accvgpr_mov_b32 a25, a55 +; FAST90A-NEXT: v_accvgpr_mov_b32 a26, a56 +; FAST90A-NEXT: v_accvgpr_mov_b32 a27, a57 +; FAST90A-NEXT: v_accvgpr_mov_b32 a28, a58 +; FAST90A-NEXT: v_accvgpr_mov_b32 a29, a59 +; FAST90A-NEXT: v_accvgpr_mov_b32 a30, a60 +; FAST90A-NEXT: v_accvgpr_mov_b32 a31, a61 ; FAST90A-NEXT: s_nop 1 ; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] ; FAST90A-NEXT: s_nop 15 @@ -709,39 +677,51 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 ; GREEDY90A-NEXT: v_mov_b32_e32 v0, 1.0 ; GREEDY90A-NEXT: v_mov_b32_e32 v1, 2.0 -; GREEDY90A-NEXT: v_mov_b32_e32 v2, 0 ; GREEDY90A-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GREEDY90A-NEXT: s_waitcnt lgkmcnt(0) -; GREEDY90A-NEXT: v_accvgpr_write_b32 a33, s15 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a32, s14 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a31, s13 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a30, s12 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a29, s11 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a28, s10 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a27, s9 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a26, s8 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a25, s7 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a24, s6 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a23, s5 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a22, s4 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a21, s3 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a20, s2 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a19, s1 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a18, s0 -; GREEDY90A-NEXT: s_nop 1 -; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[18:33], v0, v1, a[18:33] -; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[2:17], v0, v1, a[18:33] -; GREEDY90A-NEXT: s_nop 9 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a0, a18 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a1, a19 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a0, s0 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a1, s1 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a2, s2 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a3, s3 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a4, s4 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a5, s5 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a6, s6 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a7, s7 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a8, s8 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a9, s9 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a10, s10 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a11, s11 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a12, s12 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a13, s13 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a14, s14 +; GREEDY90A-NEXT: v_accvgpr_write_b32 a15, s15 ; GREEDY90A-NEXT: s_nop 1 ; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] +; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[16:31], v0, v1, a[0:15] ; GREEDY90A-NEXT: s_nop 10 -; GREEDY90A-NEXT: global_store_dwordx4 v2, a[12:15], s[16:17] offset:48 -; GREEDY90A-NEXT: global_store_dwordx4 v2, a[8:11], s[16:17] offset:32 -; GREEDY90A-NEXT: global_store_dwordx4 v2, a[4:7], s[16:17] offset:16 -; GREEDY90A-NEXT: global_store_dwordx4 v2, a[0:3], s[16:17] +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a2, a16 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a3, a17 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a4, a18 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a5, a19 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a6, a20 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a7, a21 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a8, a22 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a9, a23 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a10, a24 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a11, a25 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a12, a26 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a13, a27 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a14, a28 +; GREEDY90A-NEXT: v_accvgpr_mov_b32 a15, a29 +; GREEDY90A-NEXT: s_nop 1 +; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] +; GREEDY90A-NEXT: v_mov_b32_e32 v0, 0 +; GREEDY90A-NEXT: s_nop 9 +; GREEDY90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 +; GREEDY90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 +; GREEDY90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 +; GREEDY90A-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GREEDY90A-NEXT: s_endpgm ; ; GREEDY942-LABEL: test_mfma_f32_16x16x1f32: @@ -749,39 +729,51 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 ; GREEDY942-NEXT: v_mov_b32_e32 v0, 1.0 ; GREEDY942-NEXT: v_mov_b32_e32 v1, 2.0 -; GREEDY942-NEXT: v_mov_b32_e32 v2, 0 ; GREEDY942-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GREEDY942-NEXT: s_waitcnt lgkmcnt(0) -; GREEDY942-NEXT: v_accvgpr_write_b32 a33, s15 -; GREEDY942-NEXT: v_accvgpr_write_b32 a32, s14 -; GREEDY942-NEXT: v_accvgpr_write_b32 a31, s13 -; GREEDY942-NEXT: v_accvgpr_write_b32 a30, s12 -; GREEDY942-NEXT: v_accvgpr_write_b32 a29, s11 -; GREEDY942-NEXT: v_accvgpr_write_b32 a28, s10 -; GREEDY942-NEXT: v_accvgpr_write_b32 a27, s9 -; GREEDY942-NEXT: v_accvgpr_write_b32 a26, s8 -; GREEDY942-NEXT: v_accvgpr_write_b32 a25, s7 -; GREEDY942-NEXT: v_accvgpr_write_b32 a24, s6 -; GREEDY942-NEXT: v_accvgpr_write_b32 a23, s5 -; GREEDY942-NEXT: v_accvgpr_write_b32 a22, s4 -; GREEDY942-NEXT: v_accvgpr_write_b32 a21, s3 -; GREEDY942-NEXT: v_accvgpr_write_b32 a20, s2 -; GREEDY942-NEXT: v_accvgpr_write_b32 a19, s1 -; GREEDY942-NEXT: v_accvgpr_write_b32 a18, s0 -; GREEDY942-NEXT: s_nop 1 -; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[18:33], v0, v1, a[18:33] -; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[2:17], v0, v1, a[18:33] -; GREEDY942-NEXT: s_nop 8 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a0, a18 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a1, a19 +; GREEDY942-NEXT: v_accvgpr_write_b32 a0, s0 +; GREEDY942-NEXT: v_accvgpr_write_b32 a1, s1 +; GREEDY942-NEXT: v_accvgpr_write_b32 a2, s2 +; GREEDY942-NEXT: v_accvgpr_write_b32 a3, s3 +; GREEDY942-NEXT: v_accvgpr_write_b32 a4, s4 +; GREEDY942-NEXT: v_accvgpr_write_b32 a5, s5 +; GREEDY942-NEXT: v_accvgpr_write_b32 a6, s6 +; GREEDY942-NEXT: v_accvgpr_write_b32 a7, s7 +; GREEDY942-NEXT: v_accvgpr_write_b32 a8, s8 +; GREEDY942-NEXT: v_accvgpr_write_b32 a9, s9 +; GREEDY942-NEXT: v_accvgpr_write_b32 a10, s10 +; GREEDY942-NEXT: v_accvgpr_write_b32 a11, s11 +; GREEDY942-NEXT: v_accvgpr_write_b32 a12, s12 +; GREEDY942-NEXT: v_accvgpr_write_b32 a13, s13 +; GREEDY942-NEXT: v_accvgpr_write_b32 a14, s14 +; GREEDY942-NEXT: v_accvgpr_write_b32 a15, s15 ; GREEDY942-NEXT: s_nop 1 ; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15] +; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[16:31], v0, v1, a[0:15] ; GREEDY942-NEXT: s_nop 9 -; GREEDY942-NEXT: global_store_dwordx4 v2, a[12:15], s[16:17] offset:48 -; GREEDY942-NEXT: global_store_dwordx4 v2, a[8:11], s[16:17] offset:32 -; GREEDY942-NEXT: global_store_dwordx4 v2, a[4:7], s[16:17] offset:16 -; GREEDY942-NEXT: global_store_dwordx4 v2, a[0:3], s[16:17] +; GREEDY942-NEXT: v_accvgpr_mov_b32 a2, a16 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a3, a17 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a4, a18 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a5, a19 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a6, a20 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a7, a21 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a8, a22 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a9, a23 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a10, a24 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a11, a25 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a12, a26 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a13, a27 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a14, a28 +; GREEDY942-NEXT: v_accvgpr_mov_b32 a15, a29 +; GREEDY942-NEXT: s_nop 1 +; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15] +; GREEDY942-NEXT: v_mov_b32_e32 v0, 0 +; GREEDY942-NEXT: s_nop 8 +; GREEDY942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 +; GREEDY942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 +; GREEDY942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 +; GREEDY942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GREEDY942-NEXT: s_endpgm ; ; GREEDY90A-GISEL-LABEL: test_mfma_f32_16x16x1f32: @@ -839,9 +831,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; FAST90A-LABEL: test_mfma_f32_16x16x1f32: ; FAST90A: ; %bb.0: ; %bb ; FAST90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; FAST90A-NEXT: v_mov_b32_e32 v1, 1.0 -; FAST90A-NEXT: v_mov_b32_e32 v2, 2.0 -; FAST90A-NEXT: v_mov_b32_e32 v0, 0 +; FAST90A-NEXT: v_mov_b32_e32 v0, 1.0 +; FAST90A-NEXT: v_mov_b32_e32 v1, 2.0 ; FAST90A-NEXT: s_waitcnt lgkmcnt(0) ; FAST90A-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x0 ; FAST90A-NEXT: s_waitcnt lgkmcnt(0) @@ -862,8 +853,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; FAST90A-NEXT: v_accvgpr_write_b32 a14, s18 ; FAST90A-NEXT: v_accvgpr_write_b32 a15, s19 ; FAST90A-NEXT: s_nop 1 -; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v2, a[0:15] -; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[16:31], v1, v2, a[0:15] +; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] +; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[16:31], v0, v1, a[0:15] ; FAST90A-NEXT: s_nop 10 ; FAST90A-NEXT: v_accvgpr_mov_b32 a2, a16 ; FAST90A-NEXT: v_accvgpr_mov_b32 a3, a17 @@ -880,8 +871,9 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; FAST90A-NEXT: v_accvgpr_mov_b32 a14, a28 ; FAST90A-NEXT: v_accvgpr_mov_b32 a15, a29 ; FAST90A-NEXT: s_nop 1 -; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v2, a[0:15] -; FAST90A-NEXT: s_nop 10 +; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] +; FAST90A-NEXT: v_mov_b32_e32 v0, 0 +; FAST90A-NEXT: s_nop 9 ; FAST90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; FAST90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; FAST90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/mmra.ll b/llvm/test/CodeGen/AMDGPU/mmra.ll index f66b575379bae..7cf91eb63c837 100644 --- a/llvm/test/CodeGen/AMDGPU/mmra.ll +++ b/llvm/test/CodeGen/AMDGPU/mmra.ll @@ -12,12 +12,12 @@ define void @fence_loads(ptr %ptr) { ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; CHECK-NEXT: ATOMIC_FENCE 5, 1, mmra !0 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]], mmra !1 - ; CHECK-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr, mmra !1 :: (load acquire (s8) from %ir.ptr, align 4) + ; CHECK-NEXT: ATOMIC_FENCE 5, 1, mmra !0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]], mmra !1 + ; CHECK-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr, mmra !1 :: (load acquire (s8) from %ir.ptr, align 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]], mmra !2 - ; CHECK-NEXT: FLAT_STORE_BYTE [[COPY3]], killed [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr, mmra !2 :: (store release (s8) into %ir.ptr, align 4) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]], mmra !2 + ; CHECK-NEXT: FLAT_STORE_BYTE [[COPY3]], killed [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr, mmra !2 :: (store release (s8) into %ir.ptr, align 4) ; CHECK-NEXT: SI_RETURN fence release, !mmra !0 %ld = load atomic i8, ptr %ptr acquire, align 4, !mmra !2 @@ -33,8 +33,8 @@ define void @atomicrmw_acq(ptr %ptr) { ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]], mmra !1 - ; CHECK-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE killed [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr, mmra !1 :: (load acquire (s8) from %ir.ptr) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]], mmra !1 + ; CHECK-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE killed [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr, mmra !1 :: (load acquire (s8) from %ir.ptr) ; CHECK-NEXT: SI_RETURN %old.2 = atomicrmw add ptr %ptr, i8 0 acquire, !mmra !2 ret void @@ -53,7 +53,7 @@ define void @atomicrmw_rel(ptr %ptr) { ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -4 ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY3]], killed [[S_MOV_B32_]], implicit $exec - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_AND_B32_e64_]], %subreg.sub0, killed [[COPY2]], %subreg.sub1 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 3 ; CHECK-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY3]], [[S_MOV_B32_1]], implicit $exec @@ -61,8 +61,7 @@ define void @atomicrmw_rel(ptr %ptr) { ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 255 ; CHECK-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 killed [[V_LSHLREV_B32_e64_]], killed [[S_MOV_B32_2]], implicit $exec ; CHECK-NEXT: [[V_NOT_B32_e32_:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[V_LSHLREV_B32_e64_1]], implicit $exec - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]], mmra !2 - ; CHECK-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec, implicit $flat_scr, mmra !2 :: (load (s32) from %ir.AlignedAddr) + ; CHECK-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr, mmra !2 :: (load (s32) from %ir.AlignedAddr) ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.atomicrmw.start: @@ -72,10 +71,9 @@ define void @atomicrmw_rel(ptr %ptr) { ; CHECK-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[FLAT_LOAD_DWORD]], %bb.0, %6, %bb.1 ; CHECK-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_NOT_B32_e32_]], [[V_LSHLREV_B32_e64_1]], implicit $exec ; CHECK-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[PHI1]], killed [[V_OR_B32_e64_]], implicit $exec - ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_2]], %subreg.sub0, [[PHI1]], %subreg.sub1 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]], mmra !2 - ; CHECK-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY4]], killed [[COPY6]], 0, 1, implicit $exec, implicit $flat_scr, mmra !2 :: (load store release monotonic (s32) on %ir.AlignedAddr) - ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[FLAT_ATOMIC_CMPSWAP_RTN]], [[PHI1]], implicit $exec, mmra !2 + ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_AND_B32_e64_2]], %subreg.sub0, [[PHI1]], %subreg.sub1 + ; CHECK-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY4]], killed [[REG_SEQUENCE2]], 0, 1, implicit $exec, implicit $flat_scr, mmra !2 :: (load store release monotonic (s32) on %ir.AlignedAddr) + ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[FLAT_ATOMIC_CMPSWAP_RTN]], [[PHI1]], implicit $exec, mmra !2 ; CHECK-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_64 = SI_IF_BREAK killed [[V_CMP_EQ_U32_e64_]], [[PHI]], implicit-def dead $scc ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.2 @@ -101,7 +99,7 @@ define void @cmpxchg(ptr %ptr) { ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -4 ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY3]], killed [[S_MOV_B32_]], implicit $exec - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_AND_B32_e64_]], %subreg.sub0, killed [[COPY2]], %subreg.sub1 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 3 ; CHECK-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY3]], [[S_MOV_B32_1]], implicit $exec @@ -111,8 +109,7 @@ define void @cmpxchg(ptr %ptr) { ; CHECK-NEXT: [[V_NOT_B32_e32_:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 killed [[V_LSHLREV_B32_e64_1]], implicit $exec ; CHECK-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 1 ; CHECK-NEXT: [[V_LSHLREV_B32_e64_2:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_LSHLREV_B32_e64_]], killed [[S_MOV_B32_3]], implicit $exec - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]], mmra !1 - ; CHECK-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec, implicit $flat_scr, mmra !1 :: (load (s32) from %ir.AlignedAddr) + ; CHECK-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr, mmra !1 :: (load (s32) from %ir.AlignedAddr) ; CHECK-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed [[FLAT_LOAD_DWORD]], [[V_NOT_B32_e32_]], implicit $exec ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF @@ -124,9 +121,8 @@ define void @cmpxchg(ptr %ptr) { ; CHECK-NEXT: [[PHI1:%[0-9]+]]:sreg_64 = PHI [[S_MOV_B64_]], %bb.0, %13, %bb.3 ; CHECK-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[V_AND_B32_e64_2]], %bb.0, %11, %bb.3 ; CHECK-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[PHI2]], [[V_LSHLREV_B32_e64_2]], implicit $exec - ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_OR_B32_e64_]], %subreg.sub0, [[PHI2]], %subreg.sub1 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]], mmra !1 - ; CHECK-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY4]], killed [[COPY6]], 0, 1, implicit $exec, implicit $flat_scr, mmra !1 :: (load store acquire acquire (s32) on %ir.AlignedAddr) + ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_OR_B32_e64_]], %subreg.sub0, [[PHI2]], %subreg.sub1 + ; CHECK-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY4]], killed [[REG_SEQUENCE2]], 0, 1, implicit $exec, implicit $flat_scr, mmra !1 :: (load store acquire acquire (s32) on %ir.AlignedAddr) ; CHECK-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 [[FLAT_ATOMIC_CMPSWAP_RTN]], [[PHI2]], implicit $exec ; CHECK-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF @@ -149,8 +145,8 @@ define void @cmpxchg(ptr %ptr) { ; CHECK-NEXT: [[PHI3:%[0-9]+]]:sreg_64 = PHI [[S_OR_B64_]], %bb.1, [[S_OR_B64_1]], %bb.2 ; CHECK-NEXT: [[PHI4:%[0-9]+]]:vgpr_32 = PHI [[DEF1]], %bb.1, [[V_AND_B32_e64_3]], %bb.2 ; CHECK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[PHI3]] - ; CHECK-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_64 = SI_IF_BREAK [[COPY7]], [[PHI1]], implicit-def dead $scc + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY [[PHI3]] + ; CHECK-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_64 = SI_IF_BREAK [[COPY5]], [[PHI1]], implicit-def dead $scc ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.4 ; CHECK-NEXT: {{ $}} @@ -220,7 +216,7 @@ define void @atomicrmw_rel_deepcopy(ptr %ptr) { ; CHECK-NEXT: [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -4 ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY29]], killed [[S_MOV_B32_]], implicit $exec - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY11]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_AND_B32_e64_]], %subreg.sub0, killed [[COPY11]], %subreg.sub1 ; CHECK-NEXT: [[COPY30:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 3 ; CHECK-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY29]], [[S_MOV_B32_1]], implicit $exec @@ -228,8 +224,7 @@ define void @atomicrmw_rel_deepcopy(ptr %ptr) { ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 255 ; CHECK-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 killed [[V_LSHLREV_B32_e64_]], killed [[S_MOV_B32_2]], implicit $exec ; CHECK-NEXT: [[V_NOT_B32_e32_:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[V_LSHLREV_B32_e64_1]], implicit $exec - ; CHECK-NEXT: [[COPY31:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]], mmra !0 - ; CHECK-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY31]], 0, 0, implicit $exec, implicit $flat_scr, mmra !0 :: (load (s32) from %ir.AlignedAddr) + ; CHECK-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr, mmra !0 :: (load (s32) from %ir.AlignedAddr) ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.atomicrmw.start: @@ -239,10 +234,9 @@ define void @atomicrmw_rel_deepcopy(ptr %ptr) { ; CHECK-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[FLAT_LOAD_DWORD]], %bb.0, %6, %bb.1 ; CHECK-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_NOT_B32_e32_]], [[V_LSHLREV_B32_e64_1]], implicit $exec ; CHECK-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[PHI1]], killed [[V_OR_B32_e64_]], implicit $exec - ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_2]], %subreg.sub0, [[PHI1]], %subreg.sub1 - ; CHECK-NEXT: [[COPY32:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]], mmra !0 - ; CHECK-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY30]], killed [[COPY32]], 0, 1, implicit $exec, implicit $flat_scr, mmra !0 :: (load store release monotonic (s32) on %ir.AlignedAddr) - ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[FLAT_ATOMIC_CMPSWAP_RTN]], [[PHI1]], implicit $exec, mmra !0 + ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_AND_B32_e64_2]], %subreg.sub0, [[PHI1]], %subreg.sub1 + ; CHECK-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY30]], killed [[REG_SEQUENCE2]], 0, 1, implicit $exec, implicit $flat_scr, mmra !0 :: (load store release monotonic (s32) on %ir.AlignedAddr) + ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[FLAT_ATOMIC_CMPSWAP_RTN]], [[PHI1]], implicit $exec, mmra !0 ; CHECK-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_64 = SI_IF_BREAK killed [[V_CMP_EQ_U32_e64_]], [[PHI]], implicit-def dead $scc ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.2 diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll index cf0fbe4506d20..f9041ffcf0a4f 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll @@ -484,66 +484,69 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; W64-O0: ; %bb.0: ; %entry ; W64-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; W64-O0-NEXT: v_mov_b32_e32 v13, v4 -; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; W64-O0-NEXT: v_mov_b32_e32 v6, v3 +; W64-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; W64-O0-NEXT: v_mov_b32_e32 v12, v9 +; W64-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; W64-O0-NEXT: s_nop 0 +; W64-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; W64-O0-NEXT: v_mov_b32_e32 v12, v8 +; W64-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; W64-O0-NEXT: s_nop 0 +; W64-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; W64-O0-NEXT: v_mov_b32_e32 v12, v3 ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; W64-O0-NEXT: v_mov_b32_e32 v7, v2 -; W64-O0-NEXT: v_mov_b32_e32 v8, v1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; W64-O0-NEXT: v_mov_b32_e32 v2, v0 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; W64-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14_vgpr15_vgpr16 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v14, v5 -; W64-O0-NEXT: s_waitcnt vmcnt(3) -; W64-O0-NEXT: v_mov_b32_e32 v15, v4 +; W64-O0-NEXT: v_mov_b32_e32 v13, v2 +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; W64-O0-NEXT: v_mov_b32_e32 v14, v1 +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; W64-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5_vgpr6_vgpr7 killed $exec ; W64-O0-NEXT: s_waitcnt vmcnt(2) -; W64-O0-NEXT: v_mov_b32_e32 v16, v3 -; W64-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; W64-O0-NEXT: v_mov_b32_e32 v5, v3 +; W64-O0-NEXT: s_waitcnt vmcnt(1) +; W64-O0-NEXT: v_mov_b32_e32 v6, v2 +; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: v_mov_b32_e32 v7, v1 +; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; W64-O0-NEXT: s_nop 0 -; W64-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v3, v8 -; W64-O0-NEXT: v_mov_b32_e32 v4, v7 -; W64-O0-NEXT: v_mov_b32_e32 v5, v6 -; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v1, v14 +; W64-O0-NEXT: v_mov_b32_e32 v2, v13 +; W64-O0-NEXT: v_mov_b32_e32 v3, v12 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; W64-O0-NEXT: s_nop 0 -; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v2, v12 -; W64-O0-NEXT: s_waitcnt vmcnt(9) -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; W64-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v12, v9 +; W64-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; W64-O0-NEXT: s_nop 0 -; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v1, v10 -; W64-O0-NEXT: s_waitcnt vmcnt(10) -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; W64-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v9, v10 +; W64-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; W64-O0-NEXT: s_nop 0 -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b32 s4, 0 -; W64-O0-NEXT: ; implicit-def: $vgpr17 : SGPR spill to VGPR lane -; W64-O0-NEXT: v_writelane_b32 v17, s4, 0 +; W64-O0-NEXT: ; implicit-def: $vgpr15 : SGPR spill to VGPR lane +; W64-O0-NEXT: v_writelane_b32 v15, s4, 0 ; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v17, s4, 1 -; W64-O0-NEXT: v_writelane_b32 v17, s5, 2 +; W64-O0-NEXT: v_writelane_b32 v15, s4, 1 +; W64-O0-NEXT: v_writelane_b32 v15, s5, 2 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload @@ -568,53 +571,53 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; W64-O0-NEXT: s_mov_b32 s9, s12 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_writelane_b32 v17, s8, 3 -; W64-O0-NEXT: v_writelane_b32 v17, s9, 4 -; W64-O0-NEXT: v_writelane_b32 v17, s10, 5 -; W64-O0-NEXT: v_writelane_b32 v17, s11, 6 +; W64-O0-NEXT: v_writelane_b32 v15, s8, 3 +; W64-O0-NEXT: v_writelane_b32 v15, s9, 4 +; W64-O0-NEXT: v_writelane_b32 v15, s10, 5 +; W64-O0-NEXT: v_writelane_b32 v15, s11, 6 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; W64-O0-NEXT: v_writelane_b32 v17, s4, 7 -; W64-O0-NEXT: v_writelane_b32 v17, s5, 8 +; W64-O0-NEXT: v_writelane_b32 v15, s4, 7 +; W64-O0-NEXT: v_writelane_b32 v15, s5, 8 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: ; %bb.2: ; in Loop: Header=BB1_1 Depth=1 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v17, 7 -; W64-O0-NEXT: v_readlane_b32 s5, v17, 8 -; W64-O0-NEXT: v_readlane_b32 s8, v17, 3 -; W64-O0-NEXT: v_readlane_b32 s9, v17, 4 -; W64-O0-NEXT: v_readlane_b32 s10, v17, 5 -; W64-O0-NEXT: v_readlane_b32 s11, v17, 6 -; W64-O0-NEXT: v_readlane_b32 s6, v17, 0 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; W64-O0-NEXT: v_readlane_b32 s4, v15, 7 +; W64-O0-NEXT: v_readlane_b32 s5, v15, 8 +; W64-O0-NEXT: v_readlane_b32 s8, v15, 3 +; W64-O0-NEXT: v_readlane_b32 s9, v15, 4 +; W64-O0-NEXT: v_readlane_b32 s10, v15, 5 +; W64-O0-NEXT: v_readlane_b32 s11, v15, 6 +; W64-O0-NEXT: v_readlane_b32 s6, v15, 0 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_nop 2 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execnz .LBB1_1 ; W64-O0-NEXT: ; %bb.3: ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v17, 1 -; W64-O0-NEXT: v_readlane_b32 s5, v17, 2 +; W64-O0-NEXT: v_readlane_b32 s4, v15, 1 +; W64-O0-NEXT: v_readlane_b32 s5, v15, 2 ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v17, s4, 9 -; W64-O0-NEXT: v_writelane_b32 v17, s5, 10 +; W64-O0-NEXT: v_writelane_b32 v15, s4, 9 +; W64-O0-NEXT: v_writelane_b32 v15, s5, 10 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: .LBB1_4: ; =>This Inner Loop Header: Depth=1 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload @@ -639,48 +642,48 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; W64-O0-NEXT: s_mov_b32 s9, s12 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_writelane_b32 v17, s8, 11 -; W64-O0-NEXT: v_writelane_b32 v17, s9, 12 -; W64-O0-NEXT: v_writelane_b32 v17, s10, 13 -; W64-O0-NEXT: v_writelane_b32 v17, s11, 14 +; W64-O0-NEXT: v_writelane_b32 v15, s8, 11 +; W64-O0-NEXT: v_writelane_b32 v15, s9, 12 +; W64-O0-NEXT: v_writelane_b32 v15, s10, 13 +; W64-O0-NEXT: v_writelane_b32 v15, s11, 14 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; W64-O0-NEXT: v_writelane_b32 v17, s4, 15 -; W64-O0-NEXT: v_writelane_b32 v17, s5, 16 +; W64-O0-NEXT: v_writelane_b32 v15, s4, 15 +; W64-O0-NEXT: v_writelane_b32 v15, s5, 16 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: ; %bb.5: ; in Loop: Header=BB1_4 Depth=1 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v17, 15 -; W64-O0-NEXT: v_readlane_b32 s5, v17, 16 -; W64-O0-NEXT: v_readlane_b32 s8, v17, 11 -; W64-O0-NEXT: v_readlane_b32 s9, v17, 12 -; W64-O0-NEXT: v_readlane_b32 s10, v17, 13 -; W64-O0-NEXT: v_readlane_b32 s11, v17, 14 -; W64-O0-NEXT: v_readlane_b32 s6, v17, 0 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; W64-O0-NEXT: v_readlane_b32 s4, v15, 15 +; W64-O0-NEXT: v_readlane_b32 s5, v15, 16 +; W64-O0-NEXT: v_readlane_b32 s8, v15, 11 +; W64-O0-NEXT: v_readlane_b32 s9, v15, 12 +; W64-O0-NEXT: v_readlane_b32 s10, v15, 13 +; W64-O0-NEXT: v_readlane_b32 s11, v15, 14 +; W64-O0-NEXT: v_readlane_b32 s6, v15, 0 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_nop 2 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execnz .LBB1_4 ; W64-O0-NEXT: ; %bb.6: ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v17, 9 -; W64-O0-NEXT: v_readlane_b32 s5, v17, 10 +; W64-O0-NEXT: v_readlane_b32 s4, v15, 9 +; W64-O0-NEXT: v_readlane_b32 s5, v15, 10 ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) @@ -689,7 +692,7 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; W64-O0-NEXT: global_store_dword v[0:1], v2, off ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_setpc_b64 s[30:31] @@ -997,45 +1000,45 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; W64-O0: ; %bb.0: ; %entry ; W64-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; W64-O0-NEXT: v_mov_b32_e32 v8, v5 -; W64-O0-NEXT: v_mov_b32_e32 v5, v4 -; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; W64-O0-NEXT: s_nop 0 -; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; W64-O0-NEXT: v_mov_b32_e32 v8, v11 +; W64-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; W64-O0-NEXT: v_mov_b32_e32 v9, v3 +; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; W64-O0-NEXT: v_mov_b32_e32 v10, v2 +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; W64-O0-NEXT: v_mov_b32_e32 v11, v1 -; W64-O0-NEXT: v_mov_b32_e32 v5, v0 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; W64-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5_vgpr6_vgpr7 killed $exec +; W64-O0-NEXT: s_waitcnt vmcnt(2) +; W64-O0-NEXT: v_mov_b32_e32 v5, v3 +; W64-O0-NEXT: s_waitcnt vmcnt(1) +; W64-O0-NEXT: v_mov_b32_e32 v6, v2 +; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: v_mov_b32_e32 v7, v1 ; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v1, v8 -; W64-O0-NEXT: v_mov_b32_e32 v2, v6 -; W64-O0-NEXT: v_mov_b32_e32 v3, v7 -; W64-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6_vgpr7_vgpr8 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v6, v11 -; W64-O0-NEXT: v_mov_b32_e32 v7, v10 -; W64-O0-NEXT: v_mov_b32_e32 v8, v9 -; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; W64-O0-NEXT: v_mov_b32_e32 v1, v11 +; W64-O0-NEXT: v_mov_b32_e32 v2, v10 +; W64-O0-NEXT: v_mov_b32_e32 v3, v9 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_nop 0 -; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; W64-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v5, v12 -; W64-O0-NEXT: s_waitcnt vmcnt(6) -; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; W64-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v9, v12 +; W64-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; W64-O0-NEXT: s_nop 0 -; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; W64-O0-NEXT: s_waitcnt vmcnt(6) -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; W64-O0-NEXT: s_nop 0 -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; W64-O0-NEXT: ;;#ASMSTART ; W64-O0-NEXT: s_mov_b32 s4, 17 ; W64-O0-NEXT: ;;#ASMEND @@ -1106,7 +1109,7 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; W64-O0-NEXT: s_nop 2 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execnz .LBB2_1 ; W64-O0-NEXT: ; %bb.3: @@ -1118,13 +1121,13 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; W64-O0-NEXT: v_readlane_b32 s7, v13, 3 ; W64-O0-NEXT: s_mov_b64 exec, s[6:7] ; W64-O0-NEXT: v_readlane_b32 s4, v13, 1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b32 s5, 0x3ff ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: v_and_b32_e64 v1, v1, s5 ; W64-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v1, s4 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 s[4:5], exec ; W64-O0-NEXT: v_writelane_b32 v13, s4, 10 ; W64-O0-NEXT: v_writelane_b32 v13, s5, 11 @@ -1143,7 +1146,7 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; W64-O0-NEXT: s_mov_b32 s5, 0 ; W64-O0-NEXT: v_writelane_b32 v13, s5, 12 ; W64-O0-NEXT: v_mov_b32_e32 v0, s4 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 s[4:5], exec ; W64-O0-NEXT: v_writelane_b32 v13, s4, 13 ; W64-O0-NEXT: v_writelane_b32 v13, s5, 14 @@ -1199,12 +1202,12 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; W64-O0-NEXT: v_readlane_b32 s10, v13, 17 ; W64-O0-NEXT: v_readlane_b32 s11, v13, 18 ; W64-O0-NEXT: v_readlane_b32 s6, v13, 12 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_nop 2 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execnz .LBB2_5 ; W64-O0-NEXT: ; %bb.7: @@ -1215,9 +1218,9 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; W64-O0-NEXT: v_readlane_b32 s4, v13, 13 ; W64-O0-NEXT: v_readlane_b32 s5, v13, 14 ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; W64-O0-NEXT: .LBB2_8: ; %bb2 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload @@ -1228,12 +1231,12 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; W64-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: global_store_dword v[0:1], v2, off ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll index 12efca7dcadb5..2462414992e36 100644 --- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll +++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll @@ -6,8 +6,8 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) { ; GFX942-LABEL: matmul_kernel: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX942-NEXT: s_mov_b32 s2, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_mov_b32 s2, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a1, 0 ; GFX942-NEXT: s_mov_b32 s3, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll index 32862f73d2f29..b761f689d6af5 100644 --- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll @@ -388,8 +388,7 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v14, v9 ; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[13:14], s[6:7] ; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[11:12], s[8:9] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[11:12], s[6:7] ; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v4 ; GFX9-O0-NEXT: s_mov_b32 s12, 32 ; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s12 @@ -422,8 +421,7 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[8:9] ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 -; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[5:6], s[8:9] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[5:6], s[6:7] ; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v0 ; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s12 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v1 diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll index fc799162e999a..e29be2b744874 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll @@ -767,32 +767,32 @@ define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class_subreg_ ; CHECK-NEXT: ; def a[0:31] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_mov_b32_e32 v18, 4.0 -; CHECK-NEXT: v_accvgpr_mov_b32 a17, a16 -; CHECK-NEXT: v_accvgpr_mov_b32 a16, a15 -; CHECK-NEXT: v_accvgpr_mov_b32 a15, a14 -; CHECK-NEXT: v_accvgpr_mov_b32 a14, a13 -; CHECK-NEXT: v_accvgpr_mov_b32 a13, a12 -; CHECK-NEXT: v_accvgpr_mov_b32 a12, a11 -; CHECK-NEXT: v_accvgpr_mov_b32 a11, a10 -; CHECK-NEXT: v_accvgpr_mov_b32 a10, a9 -; CHECK-NEXT: v_accvgpr_mov_b32 a9, a8 -; CHECK-NEXT: v_accvgpr_mov_b32 a8, a7 -; CHECK-NEXT: v_accvgpr_mov_b32 a7, a6 -; CHECK-NEXT: v_accvgpr_mov_b32 a6, a5 -; CHECK-NEXT: v_accvgpr_mov_b32 a5, a4 -; CHECK-NEXT: v_accvgpr_mov_b32 a4, a3 -; CHECK-NEXT: v_accvgpr_mov_b32 a3, a2 -; CHECK-NEXT: v_accvgpr_mov_b32 a2, a1 +; CHECK-NEXT: v_accvgpr_mov_b32 a0, a1 +; CHECK-NEXT: v_accvgpr_mov_b32 a1, a2 +; CHECK-NEXT: v_accvgpr_mov_b32 a2, a3 +; CHECK-NEXT: v_accvgpr_mov_b32 a3, a4 +; CHECK-NEXT: v_accvgpr_mov_b32 a4, a5 +; CHECK-NEXT: v_accvgpr_mov_b32 a5, a6 +; CHECK-NEXT: v_accvgpr_mov_b32 a6, a7 +; CHECK-NEXT: v_accvgpr_mov_b32 a7, a8 +; CHECK-NEXT: v_accvgpr_mov_b32 a8, a9 +; CHECK-NEXT: v_accvgpr_mov_b32 a9, a10 +; CHECK-NEXT: v_accvgpr_mov_b32 a10, a11 +; CHECK-NEXT: v_accvgpr_mov_b32 a11, a12 +; CHECK-NEXT: v_accvgpr_mov_b32 a12, a13 +; CHECK-NEXT: v_accvgpr_mov_b32 a13, a14 +; CHECK-NEXT: v_accvgpr_mov_b32 a14, a15 +; CHECK-NEXT: v_accvgpr_mov_b32 a15, a16 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: v_mfma_f32_16x16x1_4b_f32 a[2:17], v1, v18, a[2:17] +; CHECK-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v1, v18, a[0:15] ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 6, v0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_nop 7 -; CHECK-NEXT: global_store_dwordx4 v0, a[14:17], s[0:1] offset:48 -; CHECK-NEXT: global_store_dwordx4 v0, a[10:13], s[0:1] offset:32 -; CHECK-NEXT: global_store_dwordx4 v0, a[6:9], s[0:1] offset:16 -; CHECK-NEXT: global_store_dwordx4 v0, a[2:5], s[0:1] +; CHECK-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; CHECK-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; CHECK-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; CHECK-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; CHECK-NEXT: s_endpgm %def = call <32 x float> asm sideeffect "; def $0", "=a"() %src2 = shufflevector <32 x float> %def, <32 x float> poison, <16 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.gfx11plus.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.gfx11plus.ll index 16ad8ec367b2d..39c7d283ea4e8 100644 --- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.gfx11plus.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.gfx11plus.ll @@ -26,9 +26,8 @@ define amdgpu_kernel void @scalar_to_vector_i32 (ptr addrspace(1) %a, ptr addrsp ; GFX11-REAL16-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] ; GFX11-REAL16-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY3]] ; GFX11-REAL16-NEXT: [[S_PACK_HL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_HL_B32_B16 [[COPY4]], killed [[COPY5]] - ; GFX11-REAL16-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_PACK_HL_B32_B16_]], %subreg.sub0, [[S_PACK_HL_B32_B16_]], %subreg.sub1 - ; GFX11-REAL16-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] - ; GFX11-REAL16-NEXT: GLOBAL_STORE_DWORDX2_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY6]], killed [[S_LOAD_DWORDX2_IMM1]], 0, 0, implicit $exec :: (store (s64) into %ir.out.load, addrspace 1) + ; GFX11-REAL16-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[S_PACK_HL_B32_B16_]], %subreg.sub0, [[S_PACK_HL_B32_B16_]], %subreg.sub1 + ; GFX11-REAL16-NEXT: GLOBAL_STORE_DWORDX2_SADDR killed [[V_MOV_B32_e32_]], killed [[REG_SEQUENCE1]], killed [[S_LOAD_DWORDX2_IMM1]], 0, 0, implicit $exec :: (store (s64) into %ir.out.load, addrspace 1) ; GFX11-REAL16-NEXT: S_ENDPGM 0 ; ; GFX11-FAKE16-LABEL: name: scalar_to_vector_i32 @@ -47,9 +46,8 @@ define amdgpu_kernel void @scalar_to_vector_i32 (ptr addrspace(1) %a, ptr addrsp ; GFX11-FAKE16-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed [[S_LOAD_DWORDX2_IMM]], killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s32) from %ir.in.gep1, addrspace 1) ; GFX11-FAKE16-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 117834498 ; GFX11-FAKE16-NEXT: [[V_PERM_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERM_B32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], killed [[S_MOV_B32_2]], implicit $exec - ; GFX11-FAKE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_PERM_B32_e64_]], %subreg.sub0, [[V_PERM_B32_e64_]], %subreg.sub1 - ; GFX11-FAKE16-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; GFX11-FAKE16-NEXT: GLOBAL_STORE_DWORDX2_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY2]], killed [[S_LOAD_DWORDX2_IMM1]], 0, 0, implicit $exec :: (store (s64) into %ir.out.load, addrspace 1) + ; GFX11-FAKE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_PERM_B32_e64_]], %subreg.sub0, [[V_PERM_B32_e64_]], %subreg.sub1 + ; GFX11-FAKE16-NEXT: GLOBAL_STORE_DWORDX2_SADDR killed [[V_MOV_B32_e32_]], killed [[REG_SEQUENCE]], killed [[S_LOAD_DWORDX2_IMM1]], 0, 0, implicit $exec :: (store (s64) into %ir.out.load, addrspace 1) ; GFX11-FAKE16-NEXT: S_ENDPGM 0 entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll index c5732531f5423..a29dc34c56d3a 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll @@ -75,7 +75,7 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; CHECK-LABEL: {{^}}excess_soft_clause_reg_pressure: ; GFX908: NumSgprs: 64 ; GFX908-GCNTRACKERS: NumSgprs: 64 -; GFX908: NumVgprs: 43 +; GFX908: NumVgprs: 41 ; GFX908-GCNTRACKERS: NumVgprs: 39 ; GFX908: Occupancy: 5 ; GFX908-GCNTRACKERS: Occupancy: 6 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index 74a6d7fe39362..fdb20f372ab8d 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -1458,8 +1458,7 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 ; GCN-IR-NEXT: v_min_u32_e32 v8, v2, v3 -; GCN-IR-NEXT: s_movk_i32 s6, 0xffc5 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v8 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 0xffffffc5, v8 ; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] @@ -1650,8 +1649,7 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 ; GCN-IR-NEXT: v_min_u32_e32 v8, v2, v3 -; GCN-IR-NEXT: s_movk_i32 s6, 0xffd0 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v8 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 0xffffffd0, v8 ; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.ll b/llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.ll index 192bd2073886a..7c6ae6cad5ae7 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.ll @@ -5,25 +5,27 @@ define amdgpu_kernel void @copy_to_vreg_1(i32 %0) { ; GCN-LABEL: copy_to_vreg_1: ; GCN: ; %bb.0: ; %._crit_edge ; GCN-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN-NEXT: s_mov_b32 s5, 0 ; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: v_mov_b64_e32 v[2:3], 0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_sub_i32 s5, 1, s4 +; GCN-NEXT: s_sub_i32 s6, 1, s4 ; GCN-NEXT: s_cmp_lt_u32 s4, 2 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_and_b64 s[2:3], s[0:1], exec -; GCN-NEXT: s_cselect_b32 s3, s5, 1 +; GCN-NEXT: s_cselect_b32 s2, s6, 1 ; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GCN-NEXT: s_addc_u32 s0, 1, 0 -; GCN-NEXT: v_readfirstlane_b32 s2, v1 -; GCN-NEXT: s_cmp_ge_u32 s3, s4 -; GCN-NEXT: s_cselect_b32 s4, s0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_cmp_ge_u32 s2, s4 +; GCN-NEXT: v_readfirstlane_b32 s1, v1 +; GCN-NEXT: s_cselect_b32 s4, s0, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: s_cmp_lg_u64 0, 0 ; GCN-NEXT: s_mov_b64 s[0:1], 0 ; GCN-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN-NEXT: v_mov_b64_e32 v[0:1], 0 ; GCN-NEXT: s_branch .LBB0_3 ; GCN-NEXT: .LBB0_1: ; %Flow ; GCN-NEXT: ; in Loop: Header=BB0_3 Depth=1 @@ -48,7 +50,7 @@ define amdgpu_kernel void @copy_to_vreg_1(i32 %0) { ; GCN-NEXT: ; %bb.4: ; %pred.store.if ; GCN-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; GCN-NEXT: s_or_b64 s[6:7], s[6:7], exec -; GCN-NEXT: global_store_byte v[2:3], v1, off +; GCN-NEXT: global_store_byte v[0:1], v2, off ; GCN-NEXT: .LBB0_5: ; %Flow2 ; GCN-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] @@ -62,7 +64,7 @@ define amdgpu_kernel void @copy_to_vreg_1(i32 %0) { ; GCN-NEXT: s_cbranch_execz .LBB0_1 ; GCN-NEXT: ; %bb.7: ; %pred.store.if41 ; GCN-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; GCN-NEXT: global_store_byte v[2:3], v1, off +; GCN-NEXT: global_store_byte v[0:1], v2, off ; GCN-NEXT: s_branch .LBB0_1 ; GCN-NEXT: .LBB0_8: ; %DummyReturnBlock ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll b/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll index 4d864ad15b411..58c8462ec795c 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll @@ -576,9 +576,9 @@ define void @shufflevector_v2i32_10_physreg_even_agpr_pair_copy(ptr addrspace(1) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a4, a5 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a4 -; GFX90A-NEXT: v_accvgpr_mov_b32 a0, a5 -; GFX90A-NEXT: global_store_dwordx2 v0, a[0:1], s[16:17] +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a5 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a4 +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -590,9 +590,9 @@ define void @shufflevector_v2i32_10_physreg_even_agpr_pair_copy(ptr addrspace(1) ; GFX940-NEXT: ; def a4, a5 ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_accvgpr_mov_b32 a1, a4 -; GFX940-NEXT: v_accvgpr_mov_b32 a0, a5 -; GFX940-NEXT: global_store_dwordx2 v0, a[0:1], s[0:1] +; GFX940-NEXT: v_accvgpr_read_b32 v2, a5 +; GFX940-NEXT: v_accvgpr_read_b32 v3, a4 +; GFX940-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %asm = call { i32, i32 } asm "; def $0, $1", "={a4},={a5}"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v2i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v2i64.ll index 51dc9a51ec9d0..0b20caea9cd95 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v2i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v2i64.ll @@ -291,27 +291,31 @@ define void @v_shuffle_v2i64_v2i64__3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v2i64_v2i64__3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v2i64__3_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -474,27 +478,31 @@ define void @v_shuffle_v2i64_v2i64__1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v2i64_v2i64__1_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v2i64__1_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll index 7f8f2dbbb09a1..2ecbf9622a259 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll @@ -291,27 +291,31 @@ define void @v_shuffle_v2p0_v2p0__3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v2p0_v2p0__3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v2p0__3_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -474,27 +478,31 @@ define void @v_shuffle_v2p0_v2p0__1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v2p0_v2p0__1_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v2p0__1_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll index 35cf10f1135c9..16202a708fd5c 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll @@ -167,15 +167,15 @@ define void @v_shuffle_v3f32_v2f32__3_0_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -183,15 +183,15 @@ define void @v_shuffle_v3f32_v2f32__3_0_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -269,27 +269,27 @@ define void @v_shuffle_v3f32_v2f32__3_2_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_2_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -1232,34 +1232,35 @@ define void @v_shuffle_v3f32_v2f32__3_0_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_0_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_0_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -1288,34 +1289,35 @@ define void @v_shuffle_v3f32_v2f32__3_2_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_2_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_2_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:5] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -1838,26 +1840,27 @@ define void @v_shuffle_v3f32_v2f32__3_u_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_u_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_u_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -1886,34 +1889,35 @@ define void @v_shuffle_v3f32_v2f32__3_0_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_0_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_0_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:5] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v3 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -1994,28 +1998,29 @@ define void @v_shuffle_v3f32_v2f32__3_2_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll index befc1126d6fa4..131204c8a6430 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll @@ -411,25 +411,27 @@ define void @v_shuffle_v3f32_v3f32__5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_3_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_3_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -946,28 +948,29 @@ define void @v_shuffle_v3f32_v3f32__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__1_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__1_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -993,27 +996,29 @@ define void @v_shuffle_v3f32_v3f32__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__2_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1376,13 +1381,14 @@ define void @v_shuffle_v3f32_v3f32__5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1392,13 +1398,14 @@ define void @v_shuffle_v3f32_v3f32__5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1983,13 +1990,14 @@ define void @v_shuffle_v3f32_v3f32__5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1999,13 +2007,14 @@ define void @v_shuffle_v3f32_v3f32__5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2830,28 +2839,29 @@ define void @v_shuffle_v3f32_v3f32__4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__4_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__4_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2878,27 +2888,29 @@ define void @v_shuffle_v3f32_v3f32__5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2975,13 +2987,14 @@ define void @v_shuffle_v3f32_v3f32__5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2991,13 +3004,14 @@ define void @v_shuffle_v3f32_v3f32__5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3133,27 +3147,29 @@ define void @v_shuffle_v3f32_v3f32__5_4_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_4_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_4_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3574,13 +3590,14 @@ define void @v_shuffle_v3f32_v3f32__5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v3 +; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3590,13 +3607,14 @@ define void @v_shuffle_v3f32_v3f32__5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v3 +; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3735,27 +3753,29 @@ define void @v_shuffle_v3f32_v3f32__5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_3_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_3_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v4f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v4f32.ll index 50c69de069986..c5a08f098b4c6 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v4f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v4f32.ll @@ -965,26 +965,29 @@ define void @v_shuffle_v3f32_v4f32__7_7_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_7_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_7_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1011,26 +1014,29 @@ define void @v_shuffle_v3f32_v4f32__7_7_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_7_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_7_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1241,28 +1247,29 @@ define void @v_shuffle_v3f32_v4f32__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__1_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__1_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1289,26 +1296,29 @@ define void @v_shuffle_v3f32_v4f32__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__2_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1335,28 +1345,29 @@ define void @v_shuffle_v3f32_v4f32__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__3_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__3_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1838,14 +1849,14 @@ define void @v_shuffle_v3f32_v4f32__7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1855,14 +1866,14 @@ define void @v_shuffle_v3f32_v4f32__7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v2 +; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -2667,14 +2678,14 @@ define void @v_shuffle_v3f32_v4f32__7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: v_mov_b32_e32 v10, v1 +; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2684,14 +2695,14 @@ define void @v_shuffle_v3f32_v4f32__7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v2 +; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -4664,28 +4675,29 @@ define void @v_shuffle_v3f32_v4f32__5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__5_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__5_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -4713,26 +4725,29 @@ define void @v_shuffle_v3f32_v4f32__6_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__6_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__6_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -4760,28 +4775,29 @@ define void @v_shuffle_v3f32_v4f32__7_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -4808,26 +4824,27 @@ define void @v_shuffle_v3f32_v4f32__7_u_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_u_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_u_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -4861,14 +4878,14 @@ define void @v_shuffle_v3f32_v4f32__7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4878,15 +4895,14 @@ define void @v_shuffle_v3f32_v4f32__7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, v2 +; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -5081,28 +5097,29 @@ define void @v_shuffle_v3f32_v4f32__7_5_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -5129,29 +5146,29 @@ define void @v_shuffle_v3f32_v4f32__7_6_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_6_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_6_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -5675,14 +5692,14 @@ define void @v_shuffle_v3f32_v4f32__7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v3 +; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5692,15 +5709,14 @@ define void @v_shuffle_v3f32_v4f32__7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, v3 +; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -5899,28 +5915,29 @@ define void @v_shuffle_v3f32_v4f32__7_4_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -5947,29 +5964,29 @@ define void @v_shuffle_v3f32_v4f32__7_6_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_6_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_6_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -6706,29 +6723,29 @@ define void @v_shuffle_v3f32_v4f32__7_4_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_4_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_4_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -7484,29 +7501,29 @@ define void @v_shuffle_v3f32_v4f32__7_4_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_4_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_4_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 ; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v3 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll index 51d45922893b3..f36f23a3a932d 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll @@ -167,15 +167,15 @@ define void @v_shuffle_v3i32_v2i32__3_0_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -183,15 +183,15 @@ define void @v_shuffle_v3i32_v2i32__3_0_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -269,27 +269,27 @@ define void @v_shuffle_v3i32_v2i32__3_2_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_2_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -1232,34 +1232,35 @@ define void @v_shuffle_v3i32_v2i32__3_0_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_0_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_0_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -1288,34 +1289,35 @@ define void @v_shuffle_v3i32_v2i32__3_2_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_2_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_2_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:5] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -1838,26 +1840,27 @@ define void @v_shuffle_v3i32_v2i32__3_u_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_u_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_u_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -1886,34 +1889,35 @@ define void @v_shuffle_v3i32_v2i32__3_0_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_0_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_0_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:5] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v3 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -1994,28 +1998,29 @@ define void @v_shuffle_v3i32_v2i32__3_2_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll index 89e6a2918a68c..eacf77c931a68 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll @@ -411,25 +411,27 @@ define void @v_shuffle_v3i32_v3i32__5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_3_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_3_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -946,28 +948,29 @@ define void @v_shuffle_v3i32_v3i32__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__1_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__1_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -993,27 +996,29 @@ define void @v_shuffle_v3i32_v3i32__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__2_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1376,13 +1381,14 @@ define void @v_shuffle_v3i32_v3i32__5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1392,13 +1398,14 @@ define void @v_shuffle_v3i32_v3i32__5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1983,13 +1990,14 @@ define void @v_shuffle_v3i32_v3i32__5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1999,13 +2007,14 @@ define void @v_shuffle_v3i32_v3i32__5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2830,28 +2839,29 @@ define void @v_shuffle_v3i32_v3i32__4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__4_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__4_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2878,27 +2888,29 @@ define void @v_shuffle_v3i32_v3i32__5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2975,13 +2987,14 @@ define void @v_shuffle_v3i32_v3i32__5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2991,13 +3004,14 @@ define void @v_shuffle_v3i32_v3i32__5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3133,27 +3147,29 @@ define void @v_shuffle_v3i32_v3i32__5_4_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_4_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_4_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3574,13 +3590,14 @@ define void @v_shuffle_v3i32_v3i32__5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v3 +; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3590,13 +3607,14 @@ define void @v_shuffle_v3i32_v3i32__5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v3 +; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3735,27 +3753,29 @@ define void @v_shuffle_v3i32_v3i32__5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_3_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_3_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v4i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v4i32.ll index 11d1897d0449f..92d6c95c26599 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v4i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v4i32.ll @@ -965,26 +965,29 @@ define void @v_shuffle_v3i32_v4i32__7_7_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_7_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_7_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1011,26 +1014,29 @@ define void @v_shuffle_v3i32_v4i32__7_7_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_7_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_7_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1241,28 +1247,29 @@ define void @v_shuffle_v3i32_v4i32__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__1_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__1_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1289,26 +1296,29 @@ define void @v_shuffle_v3i32_v4i32__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__2_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1335,28 +1345,29 @@ define void @v_shuffle_v3i32_v4i32__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__3_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__3_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1838,14 +1849,14 @@ define void @v_shuffle_v3i32_v4i32__7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1855,14 +1866,14 @@ define void @v_shuffle_v3i32_v4i32__7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v2 +; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -2667,14 +2678,14 @@ define void @v_shuffle_v3i32_v4i32__7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: v_mov_b32_e32 v10, v1 +; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2684,14 +2695,14 @@ define void @v_shuffle_v3i32_v4i32__7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v2 +; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -4664,28 +4675,29 @@ define void @v_shuffle_v3i32_v4i32__5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__5_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__5_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -4713,26 +4725,29 @@ define void @v_shuffle_v3i32_v4i32__6_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__6_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__6_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -4760,28 +4775,29 @@ define void @v_shuffle_v3i32_v4i32__7_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -4808,26 +4824,27 @@ define void @v_shuffle_v3i32_v4i32__7_u_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_u_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_u_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -4861,14 +4878,14 @@ define void @v_shuffle_v3i32_v4i32__7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4878,15 +4895,14 @@ define void @v_shuffle_v3i32_v4i32__7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, v2 +; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -5081,28 +5097,29 @@ define void @v_shuffle_v3i32_v4i32__7_5_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -5129,29 +5146,29 @@ define void @v_shuffle_v3i32_v4i32__7_6_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_6_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_6_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -5675,14 +5692,14 @@ define void @v_shuffle_v3i32_v4i32__7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v3 +; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5692,15 +5709,14 @@ define void @v_shuffle_v3i32_v4i32__7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, v3 +; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -5899,28 +5915,29 @@ define void @v_shuffle_v3i32_v4i32__7_4_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -5947,29 +5964,29 @@ define void @v_shuffle_v3i32_v4i32__7_6_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_6_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_6_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -6706,29 +6723,29 @@ define void @v_shuffle_v3i32_v4i32__7_4_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_4_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_4_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -7484,29 +7501,29 @@ define void @v_shuffle_v3i32_v4i32__7_4_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_4_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_4_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 ; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v3 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v2i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v2i64.ll index a15fc3212f474..bbca5039bb02c 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v2i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v2i64.ll @@ -291,27 +291,31 @@ define void @v_shuffle_v3i64_v2i64__3_2_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_2_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v2i64__3_2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -695,28 +699,32 @@ define void @v_shuffle_v3i64_v2i64__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i64_v2i64__1_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v2i64__1_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1555,28 +1563,32 @@ define void @v_shuffle_v3i64_v2i64__3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v2i64__3_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -2148,28 +2160,32 @@ define void @v_shuffle_v3i64_v2i64__3_2_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v2i64__3_2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll index fe132493ce536..8757639c501d2 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll @@ -291,27 +291,31 @@ define void @v_shuffle_v3p0_v2p0__3_2_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_2_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v2p0__3_2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -695,28 +699,32 @@ define void @v_shuffle_v3p0_v2p0__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p0_v2p0__1_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v2p0__1_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1555,28 +1563,32 @@ define void @v_shuffle_v3p0_v2p0__3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v2p0__3_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -2148,28 +2160,32 @@ define void @v_shuffle_v3p0_v2p0__3_2_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v2p0__3_2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll index 25e087bd922ac..1434189e6bda1 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll @@ -167,15 +167,15 @@ define void @v_shuffle_v3p3_v2p3__3_0_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -183,15 +183,15 @@ define void @v_shuffle_v3p3_v2p3__3_0_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -269,27 +269,27 @@ define void @v_shuffle_v3p3_v2p3__3_2_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_2_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1232,34 +1232,35 @@ define void @v_shuffle_v3p3_v2p3__3_0_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_0_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_0_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1288,34 +1289,35 @@ define void @v_shuffle_v3p3_v2p3__3_2_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_2_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_2_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:5] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1838,26 +1840,27 @@ define void @v_shuffle_v3p3_v2p3__3_u_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_u_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_u_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1886,34 +1889,35 @@ define void @v_shuffle_v3p3_v2p3__3_0_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_0_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_0_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:5] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v3 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1994,28 +1998,29 @@ define void @v_shuffle_v3p3_v2p3__3_2_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll index 62b9da9fedb95..0c5fe591656bb 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll @@ -411,25 +411,27 @@ define void @v_shuffle_v3p3_v3p3__5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_3_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_3_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -946,28 +948,29 @@ define void @v_shuffle_v3p3_v3p3__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__1_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__1_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -993,27 +996,29 @@ define void @v_shuffle_v3p3_v3p3__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__2_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1376,13 +1381,14 @@ define void @v_shuffle_v3p3_v3p3__5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1392,13 +1398,14 @@ define void @v_shuffle_v3p3_v3p3__5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1983,13 +1990,14 @@ define void @v_shuffle_v3p3_v3p3__5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1999,13 +2007,14 @@ define void @v_shuffle_v3p3_v3p3__5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2830,28 +2839,29 @@ define void @v_shuffle_v3p3_v3p3__4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__4_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__4_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2878,27 +2888,29 @@ define void @v_shuffle_v3p3_v3p3__5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2975,13 +2987,14 @@ define void @v_shuffle_v3p3_v3p3__5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2991,13 +3004,14 @@ define void @v_shuffle_v3p3_v3p3__5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3133,27 +3147,29 @@ define void @v_shuffle_v3p3_v3p3__5_4_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_4_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_4_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3574,13 +3590,14 @@ define void @v_shuffle_v3p3_v3p3__5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v3 +; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3590,13 +3607,14 @@ define void @v_shuffle_v3p3_v3p3__5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v3 +; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3735,27 +3753,29 @@ define void @v_shuffle_v3p3_v3p3__5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_3_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_3_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v4p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v4p3.ll index 834f03f013ba1..c9f194d873e35 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v4p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v4p3.ll @@ -965,26 +965,29 @@ define void @v_shuffle_v3p3_v4p3__7_7_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_7_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_7_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1011,26 +1014,29 @@ define void @v_shuffle_v3p3_v4p3__7_7_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_7_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_7_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1241,28 +1247,29 @@ define void @v_shuffle_v3p3_v4p3__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__1_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__1_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1289,26 +1296,29 @@ define void @v_shuffle_v3p3_v4p3__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__2_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1335,28 +1345,29 @@ define void @v_shuffle_v3p3_v4p3__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__3_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__3_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1838,14 +1849,14 @@ define void @v_shuffle_v3p3_v4p3__7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1855,14 +1866,14 @@ define void @v_shuffle_v3p3_v4p3__7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v2 +; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2667,14 +2678,14 @@ define void @v_shuffle_v3p3_v4p3__7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: v_mov_b32_e32 v10, v1 +; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2684,14 +2695,14 @@ define void @v_shuffle_v3p3_v4p3__7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v2 +; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4664,28 +4675,29 @@ define void @v_shuffle_v3p3_v4p3__5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__5_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__5_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4713,26 +4725,29 @@ define void @v_shuffle_v3p3_v4p3__6_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__6_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__6_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4760,28 +4775,29 @@ define void @v_shuffle_v3p3_v4p3__7_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4808,26 +4824,27 @@ define void @v_shuffle_v3p3_v4p3__7_u_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_u_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_u_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4861,14 +4878,14 @@ define void @v_shuffle_v3p3_v4p3__7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4878,15 +4895,14 @@ define void @v_shuffle_v3p3_v4p3__7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, v2 +; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5081,28 +5097,29 @@ define void @v_shuffle_v3p3_v4p3__7_5_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5129,29 +5146,29 @@ define void @v_shuffle_v3p3_v4p3__7_6_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_6_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_6_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5675,14 +5692,14 @@ define void @v_shuffle_v3p3_v4p3__7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v3 +; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5692,15 +5709,14 @@ define void @v_shuffle_v3p3_v4p3__7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, v3 +; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5899,28 +5915,29 @@ define void @v_shuffle_v3p3_v4p3__7_4_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5947,29 +5964,29 @@ define void @v_shuffle_v3p3_v4p3__7_6_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_6_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_6_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6706,29 +6723,29 @@ define void @v_shuffle_v3p3_v4p3__7_4_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_4_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_4_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -7484,29 +7501,29 @@ define void @v_shuffle_v3p3_v4p3__7_4_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_4_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_4_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 ; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v3 +; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll index df148f299a165..c7092f04a23ed 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll @@ -272,27 +272,27 @@ define void @v_shuffle_v4f32_v2f32__3_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_2_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v2, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v2f32__3_2_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v2, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -2380,28 +2380,29 @@ define void @v_shuffle_v4f32_v2f32__3_3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_3_u_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v2, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v2f32__3_3_u_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v2, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll index 5828e40595f9f..86211d4e3c3d8 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll @@ -255,15 +255,15 @@ define void @v_shuffle_v4f32_v3f32__5_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -271,15 +271,16 @@ define void @v_shuffle_v4f32_v3f32__5_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -413,27 +414,27 @@ define void @v_shuffle_v4f32_v3f32__5_3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_3_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_3_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -553,15 +554,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -569,16 +571,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -609,16 +612,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -626,17 +629,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -719,27 +722,29 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_3_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_3_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -766,28 +771,29 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_4_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_4_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1031,31 +1037,31 @@ define void @v_shuffle_v4f32_v3f32__5_5_5_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_5_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_5_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1083,28 +1089,31 @@ define void @v_shuffle_v4f32_v3f32__5_5_5_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1180,28 +1189,29 @@ define void @v_shuffle_v4f32_v3f32__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__u_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__u_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1325,31 +1335,31 @@ define void @v_shuffle_v4f32_v3f32__2_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__2_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__2_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1375,28 +1385,29 @@ define void @v_shuffle_v4f32_v3f32__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__3_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__3_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1488,15 +1499,15 @@ define void @v_shuffle_v4f32_v3f32__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1506,15 +1517,15 @@ define void @v_shuffle_v4f32_v3f32__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1545,34 +1556,33 @@ define void @v_shuffle_v4f32_v3f32__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_u_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v5, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_u_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v5, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1606,15 +1616,15 @@ define void @v_shuffle_v4f32_v3f32__5_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1624,15 +1634,15 @@ define void @v_shuffle_v4f32_v3f32__5_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1664,35 +1674,35 @@ define void @v_shuffle_v4f32_v3f32__5_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_2_0_0: ; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1724,17 +1734,17 @@ define void @v_shuffle_v4f32_v3f32__5_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v9, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1742,17 +1752,17 @@ define void @v_shuffle_v4f32_v3f32__5_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:8] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1843,15 +1853,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1861,15 +1871,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1901,16 +1911,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1918,16 +1928,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2018,17 +2028,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_2_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2036,17 +2046,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_2_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2078,17 +2088,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v9, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v8 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2096,17 +2106,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:8] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v8 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2603,16 +2613,17 @@ define void @v_shuffle_v4f32_v3f32__5_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2620,16 +2631,17 @@ define void @v_shuffle_v4f32_v3f32__5_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2661,16 +2673,17 @@ define void @v_shuffle_v4f32_v3f32__5_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2678,17 +2691,17 @@ define void @v_shuffle_v4f32_v3f32__5_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2720,16 +2733,17 @@ define void @v_shuffle_v4f32_v3f32__5_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2737,16 +2751,17 @@ define void @v_shuffle_v4f32_v3f32__5_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2834,16 +2849,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2851,17 +2867,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2893,15 +2909,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2909,16 +2926,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2949,15 +2966,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2965,16 +2984,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3006,16 +3026,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3023,17 +3044,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3065,16 +3086,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3082,17 +3104,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3124,16 +3146,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v3 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3141,17 +3164,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3581,17 +3604,17 @@ define void @v_shuffle_v4f32_v3f32__5_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3599,17 +3622,17 @@ define void @v_shuffle_v4f32_v3f32__5_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:8] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3868,16 +3891,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3885,16 +3908,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3926,16 +3949,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3943,16 +3967,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:8] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v8 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4040,36 +4065,37 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v8 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v6 +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_3_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:8] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v8 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v6 +; GFX942-NEXT: v_mov_b32_e32 v10, v4 +; GFX942-NEXT: v_mov_b32_e32 v11, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4351,31 +4377,31 @@ define void @v_shuffle_v4f32_v3f32__5_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4403,29 +4429,27 @@ define void @v_shuffle_v4f32_v3f32__5_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_u_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_u_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4456,17 +4480,17 @@ define void @v_shuffle_v4f32_v3f32__5_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4474,17 +4498,17 @@ define void @v_shuffle_v4f32_v3f32__5_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4573,36 +4597,35 @@ define void @v_shuffle_v4f32_v3f32__5_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_2_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v4 +; GFX90A-NEXT: global_store_dwordx4 v3, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_2_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v4 +; GFX942-NEXT: global_store_dwordx4 v3, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4630,31 +4653,31 @@ define void @v_shuffle_v4f32_v3f32__5_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_4_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_4_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4682,31 +4705,31 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4734,28 +4757,29 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_u_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_u_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4786,16 +4810,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4803,17 +4828,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -5369,28 +5394,27 @@ define void @v_shuffle_v4f32_v3f32__5_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_u_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_u_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -5421,17 +5445,17 @@ define void @v_shuffle_v4f32_v3f32__5_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v3 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5439,17 +5463,17 @@ define void @v_shuffle_v4f32_v3f32__5_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v3 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -5536,36 +5560,35 @@ define void @v_shuffle_v4f32_v3f32__5_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_2_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v3, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_2_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v3, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -5592,28 +5615,31 @@ define void @v_shuffle_v4f32_v3f32__5_3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_3_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_3_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -5641,28 +5667,31 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -5690,27 +5719,29 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_u_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_u_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -5742,16 +5773,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5759,17 +5791,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -5801,17 +5833,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5819,17 +5851,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -5915,27 +5947,31 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_3_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_3_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -6497,30 +6533,31 @@ define void @v_shuffle_v4f32_v3f32__5_3_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_3_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_3_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -6593,28 +6630,29 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_u_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_u_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -6646,16 +6684,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v6 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6663,17 +6702,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -6819,28 +6858,31 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_3_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_3_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll index edc540edb3ad1..d5bd41397c4f0 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll @@ -963,26 +963,29 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_4_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1009,26 +1012,29 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_5_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_5_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1440,31 +1446,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_7_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_7_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v3 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_7_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v3 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1734,28 +1740,31 @@ define void @v_shuffle_v4f32_v4f32__2_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__2_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__2_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -2785,14 +2794,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: v_mov_b32_e32 v11, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2802,14 +2812,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v11, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -4138,14 +4149,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4155,14 +4167,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v11, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -4196,14 +4209,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v3 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4213,14 +4227,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v11, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -5448,34 +5463,37 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v7 -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v2 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v7 +; GFX90A-NEXT: v_mov_b32_e32 v11, v7 +; GFX90A-NEXT: v_mov_b32_e32 v12, v4 +; GFX90A-NEXT: v_mov_b32_e32 v13, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_4_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v7 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v2 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v7 +; GFX942-NEXT: v_mov_b32_e32 v11, v7 +; GFX942-NEXT: v_mov_b32_e32 v12, v4 +; GFX942-NEXT: v_mov_b32_e32 v13, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -7172,28 +7190,31 @@ define void @v_shuffle_v4f32_v4f32__6_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__6_4_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__6_4_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -7271,28 +7292,29 @@ define void @v_shuffle_v4f32_v4f32__7_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_u_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_u_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -7559,30 +7581,31 @@ define void @v_shuffle_v4f32_v4f32__7_5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_5_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_5_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -7660,28 +7683,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -7763,14 +7789,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7780,14 +7807,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v11, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -8041,31 +8069,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_6_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_6_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_6_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -8946,28 +8974,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -9047,14 +9078,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9064,14 +9096,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v11, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -9105,14 +9138,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v1 +; GFX90A-NEXT: v_mov_b32_e32 v11, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9122,14 +9156,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v11, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -9277,28 +9312,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -9327,31 +9365,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_6_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_6_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_6_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -10319,15 +10357,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v5 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v4 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10337,15 +10375,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v5 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v4 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v11, v4 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -10553,31 +10591,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_4_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v2 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -11777,31 +11815,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_4_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 ; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -11830,31 +11868,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_5_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_5_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 ; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll index 9d3affa6da266..03503c9dac197 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll @@ -272,27 +272,27 @@ define void @v_shuffle_v4i32_v2i32__3_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_2_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v2, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v2i32__3_2_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v2, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -2386,28 +2386,29 @@ define void @v_shuffle_v4i32_v2i32__3_3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_3_u_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v2, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v2i32__3_3_u_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v2, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll index 3a659e1753e97..fc6d2a84d4892 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll @@ -255,15 +255,15 @@ define void @v_shuffle_v4i32_v3i32__5_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -271,15 +271,16 @@ define void @v_shuffle_v4i32_v3i32__5_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -413,27 +414,27 @@ define void @v_shuffle_v4i32_v3i32__5_3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_3_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_3_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -553,15 +554,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -569,16 +571,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -609,16 +612,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -626,17 +629,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -719,27 +722,29 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_3_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_3_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -766,28 +771,29 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_4_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_4_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1031,31 +1037,31 @@ define void @v_shuffle_v4i32_v3i32__5_5_5_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_5_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_5_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1083,28 +1089,31 @@ define void @v_shuffle_v4i32_v3i32__5_5_5_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1180,28 +1189,29 @@ define void @v_shuffle_v4i32_v3i32__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__u_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__u_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1325,31 +1335,31 @@ define void @v_shuffle_v4i32_v3i32__2_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__2_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__2_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1375,28 +1385,29 @@ define void @v_shuffle_v4i32_v3i32__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__3_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__3_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1488,15 +1499,15 @@ define void @v_shuffle_v4i32_v3i32__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1506,15 +1517,15 @@ define void @v_shuffle_v4i32_v3i32__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1545,34 +1556,33 @@ define void @v_shuffle_v4i32_v3i32__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_u_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v5, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_u_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v5, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1606,15 +1616,15 @@ define void @v_shuffle_v4i32_v3i32__5_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1624,15 +1634,15 @@ define void @v_shuffle_v4i32_v3i32__5_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1664,35 +1674,35 @@ define void @v_shuffle_v4i32_v3i32__5_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_2_0_0: ; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1724,17 +1734,17 @@ define void @v_shuffle_v4i32_v3i32__5_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v9, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1742,17 +1752,17 @@ define void @v_shuffle_v4i32_v3i32__5_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:8] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1843,15 +1853,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1861,15 +1871,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1901,16 +1911,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1918,16 +1928,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2018,17 +2028,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_2_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2036,17 +2046,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_2_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2078,17 +2088,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v9, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v8 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2096,17 +2106,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:8] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v8 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2603,16 +2613,17 @@ define void @v_shuffle_v4i32_v3i32__5_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2620,16 +2631,17 @@ define void @v_shuffle_v4i32_v3i32__5_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2661,16 +2673,17 @@ define void @v_shuffle_v4i32_v3i32__5_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2678,17 +2691,17 @@ define void @v_shuffle_v4i32_v3i32__5_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2720,16 +2733,17 @@ define void @v_shuffle_v4i32_v3i32__5_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2737,16 +2751,17 @@ define void @v_shuffle_v4i32_v3i32__5_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2834,16 +2849,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2851,17 +2867,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2893,15 +2909,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2909,16 +2926,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2949,15 +2966,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2965,16 +2984,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3006,16 +3026,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3023,17 +3044,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3065,16 +3086,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3082,17 +3104,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3124,16 +3146,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v3 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3141,17 +3164,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3581,17 +3604,17 @@ define void @v_shuffle_v4i32_v3i32__5_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3599,17 +3622,17 @@ define void @v_shuffle_v4i32_v3i32__5_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:8] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3868,16 +3891,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3885,16 +3908,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3926,16 +3949,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3943,16 +3967,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:8] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v8 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4040,36 +4065,37 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v8 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v6 +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_3_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:8] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v8 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v6 +; GFX942-NEXT: v_mov_b32_e32 v10, v4 +; GFX942-NEXT: v_mov_b32_e32 v11, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4351,31 +4377,31 @@ define void @v_shuffle_v4i32_v3i32__5_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4403,29 +4429,27 @@ define void @v_shuffle_v4i32_v3i32__5_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_u_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_u_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4456,17 +4480,17 @@ define void @v_shuffle_v4i32_v3i32__5_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4474,17 +4498,17 @@ define void @v_shuffle_v4i32_v3i32__5_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4573,36 +4597,35 @@ define void @v_shuffle_v4i32_v3i32__5_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_2_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v4 +; GFX90A-NEXT: global_store_dwordx4 v3, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_2_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v4 +; GFX942-NEXT: global_store_dwordx4 v3, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4630,31 +4653,31 @@ define void @v_shuffle_v4i32_v3i32__5_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_4_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_4_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4682,31 +4705,31 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4734,28 +4757,29 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_u_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_u_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4786,16 +4810,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4803,17 +4828,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -5369,28 +5394,27 @@ define void @v_shuffle_v4i32_v3i32__5_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_u_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_u_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -5421,17 +5445,17 @@ define void @v_shuffle_v4i32_v3i32__5_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v3 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5439,17 +5463,17 @@ define void @v_shuffle_v4i32_v3i32__5_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v3 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -5536,36 +5560,35 @@ define void @v_shuffle_v4i32_v3i32__5_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_2_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v3, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_2_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v3, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -5592,28 +5615,31 @@ define void @v_shuffle_v4i32_v3i32__5_3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_3_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_3_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -5641,28 +5667,31 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -5690,27 +5719,29 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_u_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_u_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -5742,16 +5773,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5759,17 +5791,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -5801,17 +5833,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5819,17 +5851,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -5915,27 +5947,31 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_3_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_3_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -6497,30 +6533,31 @@ define void @v_shuffle_v4i32_v3i32__5_3_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_3_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_3_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -6593,28 +6630,29 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_u_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_u_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -6646,16 +6684,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v6 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6663,17 +6702,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -6819,28 +6858,31 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_3_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_3_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll index 983afa566e2c1..ee2f94b90ffa9 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll @@ -963,26 +963,29 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_4_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1009,26 +1012,29 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_5_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_5_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1440,31 +1446,31 @@ define void @v_shuffle_v4i32_v4i32__7_7_7_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_7_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v3 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_7_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v3 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1734,28 +1740,31 @@ define void @v_shuffle_v4i32_v4i32__2_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__2_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__2_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -2785,14 +2794,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: v_mov_b32_e32 v11, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2802,14 +2812,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v11, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -4138,14 +4149,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4155,14 +4167,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v11, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -4196,14 +4209,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v3 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4213,14 +4227,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v11, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -5448,34 +5463,37 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v7 -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v2 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v7 +; GFX90A-NEXT: v_mov_b32_e32 v11, v7 +; GFX90A-NEXT: v_mov_b32_e32 v12, v4 +; GFX90A-NEXT: v_mov_b32_e32 v13, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_4_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v7 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v2 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v7 +; GFX942-NEXT: v_mov_b32_e32 v11, v7 +; GFX942-NEXT: v_mov_b32_e32 v12, v4 +; GFX942-NEXT: v_mov_b32_e32 v13, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -7172,28 +7190,31 @@ define void @v_shuffle_v4i32_v4i32__6_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__6_4_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__6_4_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -7271,28 +7292,29 @@ define void @v_shuffle_v4i32_v4i32__7_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_u_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_u_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -7559,30 +7581,31 @@ define void @v_shuffle_v4i32_v4i32__7_5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_5_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_5_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -7660,28 +7683,31 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -7763,14 +7789,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7780,14 +7807,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v11, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -8041,31 +8069,31 @@ define void @v_shuffle_v4i32_v4i32__7_7_6_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_6_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_6_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -8946,28 +8974,31 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -9047,14 +9078,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9064,14 +9096,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v11, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -9105,14 +9138,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v1 +; GFX90A-NEXT: v_mov_b32_e32 v11, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9122,14 +9156,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v11, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -9277,28 +9312,31 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -9327,31 +9365,31 @@ define void @v_shuffle_v4i32_v4i32__7_7_6_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_6_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_6_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -10319,15 +10357,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v5 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v4 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10337,15 +10375,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v5 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v4 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v11, v4 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -10553,31 +10591,31 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_4_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v2 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -11777,31 +11815,31 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_4_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 ; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -11830,31 +11868,31 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_5_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_5_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 ; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v2i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v2i64.ll index ac7d9557ce765..21ec9acf6317d 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v2i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v2i64.ll @@ -291,27 +291,31 @@ define void @v_shuffle_v4i64_v2i64__3_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_2_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_2_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -624,15 +628,15 @@ define void @v_shuffle_v4i64_v2i64__3_3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -642,18 +646,18 @@ define void @v_shuffle_v4i64_v2i64__3_3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v11, v1 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -752,15 +756,15 @@ define void @v_shuffle_v4i64_v2i64__3_3_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -770,15 +774,15 @@ define void @v_shuffle_v4i64_v2i64__3_3_3_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -953,33 +957,39 @@ define void @v_shuffle_v4i64_v2i64__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__1_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__1_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 ; GFX942-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1206,18 +1216,18 @@ define void @v_shuffle_v4i64_v2i64__3_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 ; GFX90A-NEXT: v_mov_b32_e32 v10, v0 ; GFX90A-NEXT: v_mov_b32_e32 v11, v1 +; GFX90A-NEXT: v_mov_b32_e32 v12, v0 +; GFX90A-NEXT: v_mov_b32_e32 v13, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1227,18 +1237,18 @@ define void @v_shuffle_v4i64_v2i64__3_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 ; GFX942-NEXT: v_mov_b32_e32 v10, v0 ; GFX942-NEXT: v_mov_b32_e32 v11, v1 +; GFX942-NEXT: v_mov_b32_e32 v12, v0 +; GFX942-NEXT: v_mov_b32_e32 v13, v1 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1556,15 +1566,15 @@ define void @v_shuffle_v4i64_v2i64__3_3_2_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1574,18 +1584,18 @@ define void @v_shuffle_v4i64_v2i64__3_3_2_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: v_mov_b32_e32 v11, v1 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1663,33 +1673,33 @@ define void @v_shuffle_v4i64_v2i64__0_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__0_1_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_mov_b32_e32 v6, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__0_1_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_mov_b32_e32 v6, v2 ; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1961,17 +1971,17 @@ define void @v_shuffle_v4i64_v2i64__3_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v10, v2 ; GFX90A-NEXT: v_mov_b32_e32 v11, v3 +; GFX90A-NEXT: v_mov_b32_e32 v12, v2 +; GFX90A-NEXT: v_mov_b32_e32 v13, v3 ; GFX90A-NEXT: v_mov_b32_e32 v2, v6 ; GFX90A-NEXT: v_mov_b32_e32 v3, v7 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1984,17 +1994,17 @@ define void @v_shuffle_v4i64_v2i64__3_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v10, v2 ; GFX942-NEXT: v_mov_b32_e32 v11, v3 +; GFX942-NEXT: v_mov_b32_e32 v12, v2 +; GFX942-NEXT: v_mov_b32_e32 v13, v3 ; GFX942-NEXT: v_mov_b32_e32 v2, v6 ; GFX942-NEXT: v_mov_b32_e32 v3, v7 ; GFX942-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -2447,33 +2457,39 @@ define void @v_shuffle_v4i64_v2i64__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_2_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_2_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 ; GFX942-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -2508,15 +2524,15 @@ define void @v_shuffle_v4i64_v2i64__3_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2526,15 +2542,15 @@ define void @v_shuffle_v4i64_v2i64__3_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -2577,17 +2593,17 @@ define void @v_shuffle_v4i64_v2i64__3_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v8, v2 ; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: v_mov_b32_e32 v11, v3 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2597,21 +2613,21 @@ define void @v_shuffle_v4i64_v2i64__3_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 ; GFX942-NEXT: v_mov_b32_e32 v8, v2 ; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: v_mov_b32_e32 v10, v2 +; GFX942-NEXT: v_mov_b32_e32 v11, v3 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 ; GFX942-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -2649,18 +2665,18 @@ define void @v_shuffle_v4i64_v2i64__3_1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 ; GFX90A-NEXT: v_mov_b32_e32 v10, v4 ; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: v_mov_b32_e32 v12, v4 +; GFX90A-NEXT: v_mov_b32_e32 v13, v5 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2670,19 +2686,19 @@ define void @v_shuffle_v4i64_v2i64__3_1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 ; GFX942-NEXT: v_mov_b32_e32 v10, v4 ; GFX942-NEXT: v_mov_b32_e32 v11, v5 +; GFX942-NEXT: v_mov_b32_e32 v12, v4 +; GFX942-NEXT: v_mov_b32_e32 v13, v5 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -2717,15 +2733,15 @@ define void @v_shuffle_v4i64_v2i64__3_3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2735,15 +2751,15 @@ define void @v_shuffle_v4i64_v2i64__3_3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -2776,13 +2792,13 @@ define void @v_shuffle_v4i64_v2i64__3_3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2792,13 +2808,13 @@ define void @v_shuffle_v4i64_v2i64__3_3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -3134,33 +3150,33 @@ define void @v_shuffle_v4i64_v2i64__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__2_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_mov_b32_e32 v6, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__2_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_mov_b32_e32 v6, v2 ; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -3374,39 +3390,39 @@ define void @v_shuffle_v4i64_v2i64__3_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_2_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_mov_b32_e32 v6, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_2_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_mov_b32_e32 v6, v2 ; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll index 8dd4a40d00680..615b382aa355a 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll @@ -1126,15 +1126,15 @@ define void @v_shuffle_v4i64_v3i64__5_5_5_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1144,15 +1144,15 @@ define void @v_shuffle_v4i64_v3i64__5_5_5_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v11, v1 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -1388,13 +1388,15 @@ define void @v_shuffle_v4i64_v3i64__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1404,13 +1406,15 @@ define void @v_shuffle_v4i64_v3i64__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v11, v1 ; GFX942-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -3637,33 +3641,33 @@ define void @v_shuffle_v4i64_v3i64__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__1_2_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 ; GFX90A-NEXT: v_mov_b32_e32 v8, v4 ; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__1_2_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 ; GFX942-NEXT: v_mov_b32_e32 v8, v4 ; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v4 +; GFX942-NEXT: v_mov_b32_e32 v11, v5 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -4787,13 +4791,15 @@ define void @v_shuffle_v4i64_v3i64__4_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4803,13 +4809,15 @@ define void @v_shuffle_v4i64_v3i64__4_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v11, v1 ; GFX942-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -5169,15 +5177,15 @@ define void @v_shuffle_v4i64_v3i64__5_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v8, v0 ; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5187,15 +5195,15 @@ define void @v_shuffle_v4i64_v3i64__5_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v8, v0 ; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v11, v1 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -5532,15 +5540,15 @@ define void @v_shuffle_v4i64_v3i64__5_5_4_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5550,15 +5558,15 @@ define void @v_shuffle_v4i64_v3i64__5_5_4_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v11, v1 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -6255,17 +6263,17 @@ define void @v_shuffle_v4i64_v3i64__5_3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v8, v2 ; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: v_mov_b32_e32 v11, v3 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6275,17 +6283,17 @@ define void @v_shuffle_v4i64_v3i64__5_3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v8, v2 ; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: v_mov_b32_e32 v10, v2 +; GFX942-NEXT: v_mov_b32_e32 v11, v3 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 ; GFX942-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -6970,33 +6978,33 @@ define void @v_shuffle_v4i64_v3i64__4_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__4_5_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 ; GFX90A-NEXT: v_mov_b32_e32 v8, v4 ; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__4_5_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 ; GFX942-NEXT: v_mov_b32_e32 v8, v4 ; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v4 +; GFX942-NEXT: v_mov_b32_e32 v11, v5 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -7344,15 +7352,15 @@ define void @v_shuffle_v4i64_v3i64__5_4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v8, v4 ; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7362,15 +7370,15 @@ define void @v_shuffle_v4i64_v3i64__5_4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v8, v4 ; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, v4 +; GFX942-NEXT: v_mov_b32_e32 v11, v5 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll index ea9ef2f1ac94a..32f6e00716e37 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll @@ -8328,15 +8328,15 @@ define void @v_shuffle_v4i64_v4i64__7_5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v10, v0 ; GFX90A-NEXT: v_mov_b32_e32 v11, v1 +; GFX90A-NEXT: v_mov_b32_e32 v12, v0 +; GFX90A-NEXT: v_mov_b32_e32 v13, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8346,15 +8346,15 @@ define void @v_shuffle_v4i64_v4i64__7_5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v10, v0 ; GFX942-NEXT: v_mov_b32_e32 v11, v1 +; GFX942-NEXT: v_mov_b32_e32 v12, v0 +; GFX942-NEXT: v_mov_b32_e32 v13, v1 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -11254,15 +11254,15 @@ define void @v_shuffle_v4i64_v4i64__7_5_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v10, v4 ; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: v_mov_b32_e32 v12, v4 +; GFX90A-NEXT: v_mov_b32_e32 v13, v5 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11272,15 +11272,15 @@ define void @v_shuffle_v4i64_v4i64__7_5_6_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v10, v4 ; GFX942-NEXT: v_mov_b32_e32 v11, v5 +; GFX942-NEXT: v_mov_b32_e32 v12, v4 +; GFX942-NEXT: v_mov_b32_e32 v13, v5 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll index b30af835a7882..ee3b303f88471 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll @@ -291,27 +291,31 @@ define void @v_shuffle_v4p0_v2p0__3_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_2_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_2_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -624,15 +628,15 @@ define void @v_shuffle_v4p0_v2p0__3_3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -642,18 +646,18 @@ define void @v_shuffle_v4p0_v2p0__3_3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v11, v1 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -752,15 +756,15 @@ define void @v_shuffle_v4p0_v2p0__3_3_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -770,15 +774,15 @@ define void @v_shuffle_v4p0_v2p0__3_3_3_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -953,33 +957,39 @@ define void @v_shuffle_v4p0_v2p0__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__1_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__1_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 ; GFX942-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1206,18 +1216,18 @@ define void @v_shuffle_v4p0_v2p0__3_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 ; GFX90A-NEXT: v_mov_b32_e32 v10, v0 ; GFX90A-NEXT: v_mov_b32_e32 v11, v1 +; GFX90A-NEXT: v_mov_b32_e32 v12, v0 +; GFX90A-NEXT: v_mov_b32_e32 v13, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1227,18 +1237,18 @@ define void @v_shuffle_v4p0_v2p0__3_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 ; GFX942-NEXT: v_mov_b32_e32 v10, v0 ; GFX942-NEXT: v_mov_b32_e32 v11, v1 +; GFX942-NEXT: v_mov_b32_e32 v12, v0 +; GFX942-NEXT: v_mov_b32_e32 v13, v1 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1556,15 +1566,15 @@ define void @v_shuffle_v4p0_v2p0__3_3_2_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1574,18 +1584,18 @@ define void @v_shuffle_v4p0_v2p0__3_3_2_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: v_mov_b32_e32 v11, v1 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1663,33 +1673,33 @@ define void @v_shuffle_v4p0_v2p0__0_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__0_1_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_mov_b32_e32 v6, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__0_1_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_mov_b32_e32 v6, v2 ; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1961,17 +1971,17 @@ define void @v_shuffle_v4p0_v2p0__3_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v10, v2 ; GFX90A-NEXT: v_mov_b32_e32 v11, v3 +; GFX90A-NEXT: v_mov_b32_e32 v12, v2 +; GFX90A-NEXT: v_mov_b32_e32 v13, v3 ; GFX90A-NEXT: v_mov_b32_e32 v2, v6 ; GFX90A-NEXT: v_mov_b32_e32 v3, v7 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1984,17 +1994,17 @@ define void @v_shuffle_v4p0_v2p0__3_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v10, v2 ; GFX942-NEXT: v_mov_b32_e32 v11, v3 +; GFX942-NEXT: v_mov_b32_e32 v12, v2 +; GFX942-NEXT: v_mov_b32_e32 v13, v3 ; GFX942-NEXT: v_mov_b32_e32 v2, v6 ; GFX942-NEXT: v_mov_b32_e32 v3, v7 ; GFX942-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -2447,33 +2457,39 @@ define void @v_shuffle_v4p0_v2p0__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_2_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_2_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 ; GFX942-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -2508,15 +2524,15 @@ define void @v_shuffle_v4p0_v2p0__3_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2526,15 +2542,15 @@ define void @v_shuffle_v4p0_v2p0__3_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -2577,17 +2593,17 @@ define void @v_shuffle_v4p0_v2p0__3_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v8, v2 ; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: v_mov_b32_e32 v11, v3 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2597,21 +2613,21 @@ define void @v_shuffle_v4p0_v2p0__3_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 ; GFX942-NEXT: v_mov_b32_e32 v8, v2 ; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: v_mov_b32_e32 v10, v2 +; GFX942-NEXT: v_mov_b32_e32 v11, v3 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 ; GFX942-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -2649,18 +2665,18 @@ define void @v_shuffle_v4p0_v2p0__3_1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 ; GFX90A-NEXT: v_mov_b32_e32 v10, v4 ; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: v_mov_b32_e32 v12, v4 +; GFX90A-NEXT: v_mov_b32_e32 v13, v5 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2670,19 +2686,19 @@ define void @v_shuffle_v4p0_v2p0__3_1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 ; GFX942-NEXT: v_mov_b32_e32 v10, v4 ; GFX942-NEXT: v_mov_b32_e32 v11, v5 +; GFX942-NEXT: v_mov_b32_e32 v12, v4 +; GFX942-NEXT: v_mov_b32_e32 v13, v5 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -2717,15 +2733,15 @@ define void @v_shuffle_v4p0_v2p0__3_3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2735,15 +2751,15 @@ define void @v_shuffle_v4p0_v2p0__3_3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -2776,13 +2792,13 @@ define void @v_shuffle_v4p0_v2p0__3_3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2792,13 +2808,13 @@ define void @v_shuffle_v4p0_v2p0__3_3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -3134,33 +3150,33 @@ define void @v_shuffle_v4p0_v2p0__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__2_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_mov_b32_e32 v6, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__2_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_mov_b32_e32 v6, v2 ; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -3374,39 +3390,39 @@ define void @v_shuffle_v4p0_v2p0__3_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_2_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_mov_b32_e32 v6, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_2_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_mov_b32_e32 v6, v2 ; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll index e6ac554735eee..09e497259766e 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll @@ -1126,15 +1126,15 @@ define void @v_shuffle_v4p0_v3p0__5_5_5_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1144,15 +1144,15 @@ define void @v_shuffle_v4p0_v3p0__5_5_5_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v11, v1 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -1388,13 +1388,15 @@ define void @v_shuffle_v4p0_v3p0__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1404,13 +1406,15 @@ define void @v_shuffle_v4p0_v3p0__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v11, v1 ; GFX942-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -3637,33 +3641,33 @@ define void @v_shuffle_v4p0_v3p0__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__1_2_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 ; GFX90A-NEXT: v_mov_b32_e32 v8, v4 ; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__1_2_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 ; GFX942-NEXT: v_mov_b32_e32 v8, v4 ; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v4 +; GFX942-NEXT: v_mov_b32_e32 v11, v5 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -4787,13 +4791,15 @@ define void @v_shuffle_v4p0_v3p0__4_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4803,13 +4809,15 @@ define void @v_shuffle_v4p0_v3p0__4_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v11, v1 ; GFX942-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -5169,15 +5177,15 @@ define void @v_shuffle_v4p0_v3p0__5_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v8, v0 ; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5187,15 +5195,15 @@ define void @v_shuffle_v4p0_v3p0__5_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v8, v0 ; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v11, v1 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -5532,15 +5540,15 @@ define void @v_shuffle_v4p0_v3p0__5_5_4_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5550,15 +5558,15 @@ define void @v_shuffle_v4p0_v3p0__5_5_4_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v11, v1 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -6255,17 +6263,17 @@ define void @v_shuffle_v4p0_v3p0__5_3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v8, v2 ; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: v_mov_b32_e32 v11, v3 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6275,17 +6283,17 @@ define void @v_shuffle_v4p0_v3p0__5_3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v8, v2 ; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: v_mov_b32_e32 v10, v2 +; GFX942-NEXT: v_mov_b32_e32 v11, v3 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 ; GFX942-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -6970,33 +6978,33 @@ define void @v_shuffle_v4p0_v3p0__4_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__4_5_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 ; GFX90A-NEXT: v_mov_b32_e32 v8, v4 ; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__4_5_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 ; GFX942-NEXT: v_mov_b32_e32 v8, v4 ; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v4 +; GFX942-NEXT: v_mov_b32_e32 v11, v5 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -7344,15 +7352,15 @@ define void @v_shuffle_v4p0_v3p0__5_4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v8, v4 ; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7362,15 +7370,15 @@ define void @v_shuffle_v4p0_v3p0__5_4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v8, v4 ; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, v4 +; GFX942-NEXT: v_mov_b32_e32 v11, v5 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll index ce1c54129f706..257af574366a6 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll @@ -8328,15 +8328,15 @@ define void @v_shuffle_v4p0_v4p0__7_5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v10, v0 ; GFX90A-NEXT: v_mov_b32_e32 v11, v1 +; GFX90A-NEXT: v_mov_b32_e32 v12, v0 +; GFX90A-NEXT: v_mov_b32_e32 v13, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8346,15 +8346,15 @@ define void @v_shuffle_v4p0_v4p0__7_5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v10, v0 ; GFX942-NEXT: v_mov_b32_e32 v11, v1 +; GFX942-NEXT: v_mov_b32_e32 v12, v0 +; GFX942-NEXT: v_mov_b32_e32 v13, v1 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -11254,15 +11254,15 @@ define void @v_shuffle_v4p0_v4p0__7_5_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v10, v4 ; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: v_mov_b32_e32 v12, v4 +; GFX90A-NEXT: v_mov_b32_e32 v13, v5 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11272,15 +11272,15 @@ define void @v_shuffle_v4p0_v4p0__7_5_6_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v10, v4 ; GFX942-NEXT: v_mov_b32_e32 v11, v5 +; GFX942-NEXT: v_mov_b32_e32 v12, v4 +; GFX942-NEXT: v_mov_b32_e32 v13, v5 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll index 3b5690562c38a..90a1b99dc7c14 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll @@ -272,27 +272,27 @@ define void @v_shuffle_v4p3_v2p3__3_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_2_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v2, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v2p3__3_2_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v2, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2386,28 +2386,29 @@ define void @v_shuffle_v4p3_v2p3__3_3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_3_u_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v2, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v2p3__3_3_u_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v2, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll index f1c1e4b20f242..bcb20e85b2e94 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll @@ -255,15 +255,15 @@ define void @v_shuffle_v4p3_v3p3__5_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -271,15 +271,16 @@ define void @v_shuffle_v4p3_v3p3__5_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -413,27 +414,27 @@ define void @v_shuffle_v4p3_v3p3__5_3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_3_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_3_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -553,15 +554,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -569,16 +571,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -609,16 +612,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -626,17 +629,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -719,27 +722,29 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_3_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_3_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -766,28 +771,29 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_4_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_4_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1031,31 +1037,31 @@ define void @v_shuffle_v4p3_v3p3__5_5_5_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_5_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_5_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1083,28 +1089,31 @@ define void @v_shuffle_v4p3_v3p3__5_5_5_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1180,28 +1189,29 @@ define void @v_shuffle_v4p3_v3p3__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__u_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__u_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1325,31 +1335,31 @@ define void @v_shuffle_v4p3_v3p3__2_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__2_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__2_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1375,28 +1385,29 @@ define void @v_shuffle_v4p3_v3p3__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__3_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__3_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1488,15 +1499,15 @@ define void @v_shuffle_v4p3_v3p3__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1506,15 +1517,15 @@ define void @v_shuffle_v4p3_v3p3__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1545,34 +1556,33 @@ define void @v_shuffle_v4p3_v3p3__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_u_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v5, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_u_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v5, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1606,15 +1616,15 @@ define void @v_shuffle_v4p3_v3p3__5_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1624,15 +1634,15 @@ define void @v_shuffle_v4p3_v3p3__5_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1664,35 +1674,35 @@ define void @v_shuffle_v4p3_v3p3__5_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_2_0_0: ; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1724,17 +1734,17 @@ define void @v_shuffle_v4p3_v3p3__5_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v9, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1742,17 +1752,17 @@ define void @v_shuffle_v4p3_v3p3__5_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:8] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1843,15 +1853,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1861,15 +1871,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1901,16 +1911,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1918,16 +1928,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2018,17 +2028,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_2_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2036,17 +2046,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_2_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2078,17 +2088,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v9, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v8 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2096,17 +2106,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:8] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v8 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2603,16 +2613,17 @@ define void @v_shuffle_v4p3_v3p3__5_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2620,16 +2631,17 @@ define void @v_shuffle_v4p3_v3p3__5_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2661,16 +2673,17 @@ define void @v_shuffle_v4p3_v3p3__5_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2678,17 +2691,17 @@ define void @v_shuffle_v4p3_v3p3__5_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2720,16 +2733,17 @@ define void @v_shuffle_v4p3_v3p3__5_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2737,16 +2751,17 @@ define void @v_shuffle_v4p3_v3p3__5_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2834,16 +2849,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2851,17 +2867,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2893,15 +2909,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2909,16 +2926,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2949,15 +2966,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2965,16 +2984,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3006,16 +3026,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3023,17 +3044,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3065,16 +3086,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3082,17 +3104,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3124,16 +3146,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v3 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3141,17 +3164,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3581,17 +3604,17 @@ define void @v_shuffle_v4p3_v3p3__5_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3599,17 +3622,17 @@ define void @v_shuffle_v4p3_v3p3__5_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:8] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3868,16 +3891,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3885,16 +3908,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3926,16 +3949,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3943,16 +3967,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:8] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v8 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4040,36 +4065,37 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v8 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v6 +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_3_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:8] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v8 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v6 +; GFX942-NEXT: v_mov_b32_e32 v10, v4 +; GFX942-NEXT: v_mov_b32_e32 v11, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4351,31 +4377,31 @@ define void @v_shuffle_v4p3_v3p3__5_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4403,29 +4429,27 @@ define void @v_shuffle_v4p3_v3p3__5_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_u_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_u_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4456,17 +4480,17 @@ define void @v_shuffle_v4p3_v3p3__5_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4474,17 +4498,17 @@ define void @v_shuffle_v4p3_v3p3__5_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4573,36 +4597,35 @@ define void @v_shuffle_v4p3_v3p3__5_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_2_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v4 +; GFX90A-NEXT: global_store_dwordx4 v3, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_2_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v4 +; GFX942-NEXT: global_store_dwordx4 v3, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4630,31 +4653,31 @@ define void @v_shuffle_v4p3_v3p3__5_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_4_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_4_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4682,31 +4705,31 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4734,28 +4757,29 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_u_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_u_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4786,16 +4810,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4803,17 +4828,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5369,28 +5394,27 @@ define void @v_shuffle_v4p3_v3p3__5_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_u_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_u_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5421,17 +5445,17 @@ define void @v_shuffle_v4p3_v3p3__5_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v3 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5439,17 +5463,17 @@ define void @v_shuffle_v4p3_v3p3__5_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v8, v3 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5536,36 +5560,35 @@ define void @v_shuffle_v4p3_v3p3__5_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_2_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v3, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_2_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v3, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5592,28 +5615,31 @@ define void @v_shuffle_v4p3_v3p3__5_3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_3_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_3_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5641,28 +5667,31 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5690,27 +5719,29 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_u_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_u_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5742,16 +5773,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5759,17 +5791,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5801,17 +5833,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5819,17 +5851,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5915,27 +5947,31 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_3_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_3_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6497,30 +6533,31 @@ define void @v_shuffle_v4p3_v3p3__5_3_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_3_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_3_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6593,28 +6630,29 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_u_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_u_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6646,16 +6684,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v6 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6663,17 +6702,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6819,28 +6858,31 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_3_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_3_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll index eeab42ae40d7f..1684b94cfd452 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll @@ -963,26 +963,29 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_4_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1009,26 +1012,29 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_5_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_5_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1440,31 +1446,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_7_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_7_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v3 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_7_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v3 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1734,28 +1740,31 @@ define void @v_shuffle_v4p3_v4p3__2_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__2_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__2_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2785,14 +2794,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: v_mov_b32_e32 v11, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2802,14 +2812,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v11, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4138,14 +4149,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4155,14 +4167,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v11, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4196,14 +4209,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v3 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4213,14 +4227,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v11, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5448,34 +5463,37 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v7 -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v2 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v7 +; GFX90A-NEXT: v_mov_b32_e32 v11, v7 +; GFX90A-NEXT: v_mov_b32_e32 v12, v4 +; GFX90A-NEXT: v_mov_b32_e32 v13, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_4_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v7 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v2 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v7 +; GFX942-NEXT: v_mov_b32_e32 v11, v7 +; GFX942-NEXT: v_mov_b32_e32 v12, v4 +; GFX942-NEXT: v_mov_b32_e32 v13, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -7172,28 +7190,31 @@ define void @v_shuffle_v4p3_v4p3__6_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__6_4_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__6_4_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -7271,28 +7292,29 @@ define void @v_shuffle_v4p3_v4p3__7_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_u_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_u_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -7559,30 +7581,31 @@ define void @v_shuffle_v4p3_v4p3__7_5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_5_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_5_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -7660,28 +7683,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -7763,14 +7789,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7780,14 +7807,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v11, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -8041,31 +8069,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_6_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_6_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_6_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -8946,28 +8974,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -9047,14 +9078,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9064,14 +9096,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v11, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -9105,14 +9138,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v1 +; GFX90A-NEXT: v_mov_b32_e32 v11, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9122,14 +9156,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v11, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -9277,28 +9312,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -9327,31 +9365,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_6_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_6_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_6_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -10319,15 +10357,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v5 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v4 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10337,15 +10375,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v5 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v4 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v11, v4 +; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -10553,31 +10591,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_4_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v2 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -11777,31 +11815,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_4_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 ; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -11830,31 +11868,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_5_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_5_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 ; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll index 76bf9176143ff..101787abf8ea7 100644 --- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll @@ -401,11 +401,11 @@ define void @v_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) { ; GCN-LABEL: v_select_sint_to_fp_i1_vals_f64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v3, 0xbff00000 +; GCN-NEXT: v_mov_b32_e32 v4, 0xbff00000 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN-NEXT: flat_store_dwordx2 v[0:1], v[3:4] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; @@ -414,10 +414,10 @@ define void @v_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v3, 0xbff00000 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc -; GFX942-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX942-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc +; GFX942-NEXT: global_store_dwordx2 v[0:1], v[4:5], off ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %in, 0 @@ -482,11 +482,11 @@ define void @v_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) { ; GCN-LABEL: v_select_sint_to_fp_i1_vals_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v3, 0xbff00000 +; GCN-NEXT: v_mov_b32_e32 v4, 0xbff00000 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN-NEXT: flat_store_dwordx2 v[0:1], v[3:4] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; @@ -495,10 +495,10 @@ define void @v_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v3, 0xbff00000 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc -; GFX942-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX942-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc +; GFX942-NEXT: global_store_dwordx2 v[0:1], v[4:5], off ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %in, 0 @@ -512,11 +512,11 @@ define void @v_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in ; GCN-LABEL: v_swap_select_sint_to_fp_i1_vals_f64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v3, 0xbff00000 +; GCN-NEXT: v_mov_b32_e32 v4, 0xbff00000 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc +; GCN-NEXT: flat_store_dwordx2 v[0:1], v[3:4] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; @@ -525,10 +525,10 @@ define void @v_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v3, 0xbff00000 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc -; GFX942-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX942-NEXT: v_cndmask_b32_e64 v5, v3, 0, vcc +; GFX942-NEXT: global_store_dwordx2 v[0:1], v[4:5], off ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %in, 0 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index 862e2dd2de051..02d2e6c1473ab 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -1608,8 +1608,7 @@ define i64 @v_test_srem_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 ; GCN-IR-NEXT: v_min_u32_e32 v8, v2, v3 -; GCN-IR-NEXT: s_movk_i32 s6, 0xffc5 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v8 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 0xffffffc5, v8 ; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] @@ -1798,8 +1797,7 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 ; GCN-IR-NEXT: v_min_u32_e32 v8, v2, v3 -; GCN-IR-NEXT: s_movk_i32 s6, 0xffd0 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v8 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 0xffffffd0, v8 ; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll index b31cc36a5f7c6..983acfc2c0699 100644 --- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll @@ -660,11 +660,11 @@ define void @v_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) { ; GCN-LABEL: v_select_uint_to_fp_i1_vals_f64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v3, 0x3ff00000 +; GCN-NEXT: v_mov_b32_e32 v4, 0x3ff00000 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN-NEXT: flat_store_dwordx2 v[0:1], v[3:4] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; @@ -673,10 +673,10 @@ define void @v_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v3, 0x3ff00000 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc -; GFX942-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX942-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc +; GFX942-NEXT: global_store_dwordx2 v[0:1], v[4:5], off ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %in, 0 @@ -741,11 +741,11 @@ define void @v_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) { ; GCN-LABEL: v_select_uint_to_fp_i1_vals_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v3, 0x3ff00000 +; GCN-NEXT: v_mov_b32_e32 v4, 0x3ff00000 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN-NEXT: flat_store_dwordx2 v[0:1], v[3:4] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; @@ -754,10 +754,10 @@ define void @v_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v3, 0x3ff00000 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc -; GFX942-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX942-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc +; GFX942-NEXT: global_store_dwordx2 v[0:1], v[4:5], off ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %in, 0 @@ -823,11 +823,11 @@ define void @v_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in ; GCN-LABEL: v_swap_select_uint_to_fp_i1_vals_f64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v3, 0x3ff00000 +; GCN-NEXT: v_mov_b32_e32 v4, 0x3ff00000 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc +; GCN-NEXT: flat_store_dwordx2 v[0:1], v[3:4] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; @@ -836,10 +836,10 @@ define void @v_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v3, 0x3ff00000 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc -; GFX942-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX942-NEXT: v_cndmask_b32_e64 v5, v3, 0, vcc +; GFX942-NEXT: global_store_dwordx2 v[0:1], v[4:5], off ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %in, 0 diff --git a/llvm/test/CodeGen/AMDGPU/vector_range_metadata.ll b/llvm/test/CodeGen/AMDGPU/vector_range_metadata.ll index 8af4a8de7b266..9053b8f9d4fe4 100644 --- a/llvm/test/CodeGen/AMDGPU/vector_range_metadata.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_range_metadata.ll @@ -37,8 +37,8 @@ define <2 x i64> @test_add2x64(ptr %a_ptr, ptr %b_ptr) { ; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: flat_load_dwordx4 v[6:9], v[2:3] -; CHECK-NEXT: ; kill: killed $vgpr2 killed $vgpr3 ; CHECK-NEXT: ; kill: killed $vgpr0 killed $vgpr1 +; CHECK-NEXT: ; kill: killed $vgpr2 killed $vgpr3 ; CHECK-NEXT: v_mov_b32_e32 v2, 48 ; CHECK-NEXT: v_mov_b32_e32 v3, 0 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -91,8 +91,8 @@ define <3 x i64> @test_add3x64(ptr %a_ptr, ptr %b_ptr) { ; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: flat_load_dwordx4 v[6:9], v[2:3] -; CHECK-NEXT: ; kill: killed $vgpr2 killed $vgpr3 ; CHECK-NEXT: ; kill: killed $vgpr0 killed $vgpr1 +; CHECK-NEXT: ; kill: killed $vgpr2 killed $vgpr3 ; CHECK-NEXT: v_mov_b32_e32 v2, 48 ; CHECK-NEXT: v_mov_b32_e32 v3, 0 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll b/llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll index 67264e9ed973b..599557cad2eb4 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll @@ -90,7 +90,8 @@ define i32 @test_tuple(<16 x i64> %0) { ; GFX900-NEXT: v_mov_b32_e32 v28, v35 ; GFX900-NEXT: v_mov_b32_e32 v29, v34 ; GFX900-NEXT: v_mov_b32_e32 v30, v33 -; GFX900-NEXT: ; kill: def $vgpr31 killed $vgpr32 killed $exec +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v31, v32 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX900-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -190,7 +191,8 @@ define i32 @test_tuple(<16 x i64> %0) { ; GFX906-NEXT: v_mov_b32_e32 v28, v35 ; GFX906-NEXT: v_mov_b32_e32 v29, v34 ; GFX906-NEXT: v_mov_b32_e32 v30, v33 -; GFX906-NEXT: ; kill: def $vgpr31 killed $vgpr32 killed $exec +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: v_mov_b32_e32 v31, v32 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX906-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -290,7 +292,8 @@ define i32 @test_tuple(<16 x i64> %0) { ; GFX908-NEXT: v_mov_b32_e32 v28, v35 ; GFX908-NEXT: v_mov_b32_e32 v29, v34 ; GFX908-NEXT: v_mov_b32_e32 v30, v33 -; GFX908-NEXT: ; kill: def $vgpr31 killed $vgpr32 killed $exec +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v31, v32 ; GFX908-NEXT: v_mov_b32_e32 v0, 0 ; GFX908-NEXT: v_accvgpr_read_b32 v62, a14 ; Reload Reuse ; GFX908-NEXT: v_accvgpr_read_b32 v61, a13 ; Reload Reuse @@ -307,7 +310,6 @@ define i32 @test_tuple(<16 x i64> %0) { ; GFX908-NEXT: v_accvgpr_read_b32 v42, a2 ; Reload Reuse ; GFX908-NEXT: v_accvgpr_read_b32 v41, a1 ; Reload Reuse ; GFX908-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse -; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX90a-LABEL: test_tuple: @@ -390,7 +392,8 @@ define i32 @test_tuple(<16 x i64> %0) { ; GFX90a-NEXT: v_mov_b32_e32 v28, v35 ; GFX90a-NEXT: v_mov_b32_e32 v29, v34 ; GFX90a-NEXT: v_mov_b32_e32 v30, v33 -; GFX90a-NEXT: ; kill: def $vgpr31 killed $vgpr32 killed $exec +; GFX90a-NEXT: s_waitcnt vmcnt(0) +; GFX90a-NEXT: v_mov_b32_e32 v31, v32 ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_accvgpr_read_b32 v62, a14 ; Reload Reuse ; GFX90a-NEXT: v_accvgpr_read_b32 v61, a13 ; Reload Reuse @@ -407,7 +410,6 @@ define i32 @test_tuple(<16 x i64> %0) { ; GFX90a-NEXT: v_accvgpr_read_b32 v42, a2 ; Reload Reuse ; GFX90a-NEXT: v_accvgpr_read_b32 v41, a1 ; Reload Reuse ; GFX90a-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse -; GFX90a-NEXT: s_waitcnt vmcnt(0) ; GFX90a-NEXT: s_setpc_b64 s[30:31] %2 = shufflevector <16 x i64> %0, <16 x i64> zeroinitializer, <8 x i32> ret i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll index 183e4267e582e..3bc67562012e5 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll @@ -640,7 +640,7 @@ define protected amdgpu_kernel void @nested_waterfalls(ptr addrspace(1) %tex.coe ; SI-NEXT: {{ $}} ; SI-NEXT: bb.7: ; SI-NEXT: $exec_lo = S_MOV_B32 killed [[S_MOV_B32_]] - ; SI-NEXT: GLOBAL_STORE_DWORD undef %34:vreg_64, killed [[IMAGE_SAMPLE_V1_V2_nsa_gfx10_]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1) + ; SI-NEXT: GLOBAL_STORE_DWORD undef %35:vreg_64, killed [[IMAGE_SAMPLE_V1_V2_nsa_gfx10_]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1) ; SI-NEXT: S_ENDPGM 0 entry: %0 = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll index 81e17400973a4..e9a0671ead4e0 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -877,171 +877,91 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 +; GFX9-O0-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; GFX9-O0-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 +; GFX9-O0-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:12 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 +; GFX9-O0-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; GFX9-O0-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; GFX9-O0-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-O0-NEXT: v_mov_b32_e32 v35, s5 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s11 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s12 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s13 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s14 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s15 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s16 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v40, s17 -; GFX9-O0-NEXT: v_mov_b32_e32 v39, s18 -; GFX9-O0-NEXT: v_mov_b32_e32 v38, s19 -; GFX9-O0-NEXT: v_mov_b32_e32 v37, s20 -; GFX9-O0-NEXT: v_mov_b32_e32 v36, s21 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s22 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v47, s23 -; GFX9-O0-NEXT: v_mov_b32_e32 v46, s24 -; GFX9-O0-NEXT: v_mov_b32_e32 v45, s25 -; GFX9-O0-NEXT: v_mov_b32_e32 v44, s26 -; GFX9-O0-NEXT: v_mov_b32_e32 v43, s27 -; GFX9-O0-NEXT: v_mov_b32_e32 v42, s28 -; GFX9-O0-NEXT: v_mov_b32_e32 v41, s29 -; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v35 -; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v35 -; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v35 -; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v35 -; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v35 -; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v35 -; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v35 -; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v35 -; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v35 -; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v35 -; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v35 -; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v35 -; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v40 -; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v39 -; GFX9-O0-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v38 -; GFX9-O0-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v37 -; GFX9-O0-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v36 -; GFX9-O0-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(5) -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v35 -; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v47 -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v46 -; GFX9-O0-NEXT: v_mov_b32_e32 v21, v45 -; GFX9-O0-NEXT: v_mov_b32_e32 v22, v44 -; GFX9-O0-NEXT: v_mov_b32_e32 v23, v43 -; GFX9-O0-NEXT: v_mov_b32_e32 v24, v42 -; GFX9-O0-NEXT: v_mov_b32_e32 v25, v41 -; GFX9-O0-NEXT: s_waitcnt vmcnt(5) +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s16 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, s17 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, s18 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, s19 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-O0-NEXT: v_mov_b32_e32 v18, s22 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, s23 +; GFX9-O0-NEXT: v_mov_b32_e32 v20, s24 +; GFX9-O0-NEXT: v_mov_b32_e32 v21, s25 +; GFX9-O0-NEXT: v_mov_b32_e32 v22, s26 +; GFX9-O0-NEXT: v_mov_b32_e32 v23, s27 +; GFX9-O0-NEXT: v_mov_b32_e32 v24, s28 +; GFX9-O0-NEXT: v_mov_b32_e32 v25, s29 ; GFX9-O0-NEXT: v_mov_b32_e32 v26, v40 -; GFX9-O0-NEXT: s_waitcnt vmcnt(4) ; GFX9-O0-NEXT: v_mov_b32_e32 v27, v39 -; GFX9-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-O0-NEXT: v_mov_b32_e32 v28, v38 -; GFX9-O0-NEXT: s_waitcnt vmcnt(2) ; GFX9-O0-NEXT: v_mov_b32_e32 v29, v37 -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: v_mov_b32_e32 v30, v36 ; GFX9-O0-NEXT: ; kill: def $vgpr31 killed $vgpr35 killed $exec -; GFX9-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX9-O0-NEXT: s_waitcnt vmcnt(4) ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 @@ -1178,19 +1098,12 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: v_mov_b32_e32 v23, s27 ; GFX9-O0-NEXT: v_mov_b32_e32 v24, s28 ; GFX9-O0-NEXT: v_mov_b32_e32 v25, s29 -; GFX9-O0-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected index 8d50df7050636..7d8400c943be0 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected @@ -3,15 +3,15 @@ define i64 @i64_test(i64 %i) nounwind readnone { ; CHECK-LABEL: i64_test: -; CHECK: SelectionDAG has 25 nodes: +; CHECK: SelectionDAG has 26 nodes: ; CHECK-NEXT: t0: ch,glue = EntryToken ; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %8 ; CHECK-NEXT: t4: i32,ch = CopyFromReg # D:1 t0, Register:i32 %9 -; CHECK-NEXT: t50: i64 = REG_SEQUENCE # D:1 TargetConstant:i32<66>, t2, TargetConstant:i32<3>, t4, TargetConstant:i32<11> +; CHECK-NEXT: t51: i64 = REG_SEQUENCE # D:1 TargetConstant:i32<66>, t2, TargetConstant:i32<3>, t4, TargetConstant:i32<11> ; CHECK-NEXT: t27: i32,ch = BUFFER_LOAD_DWORD_OFFEN TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0 ; CHECK-NEXT: t30: i32,ch = BUFFER_LOAD_DWORD_OFFEN TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<4>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0 -; CHECK-NEXT: t33: v2i32 = REG_SEQUENCE # D:1 TargetConstant:i32<66>, t27, TargetConstant:i32<3>, t30, TargetConstant:i32<11> -; CHECK-NEXT: t10: i64 = V_ADD_U64_PSEUDO # D:1 t50, t33 +; CHECK-NEXT: t33: v2i32 = REG_SEQUENCE # D:1 TargetConstant:i32<42>, t27, TargetConstant:i32<3>, t30, TargetConstant:i32<11> +; CHECK-NEXT: t10: i64 = V_ADD_U64_PSEUDO # D:1 t51, t33 ; CHECK-NEXT: t24: i32 = EXTRACT_SUBREG # D:1 t10, TargetConstant:i32<3> ; CHECK-NEXT: t17: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t24 ; CHECK-NEXT: t39: i32 = EXTRACT_SUBREG # D:1 t10, TargetConstant:i32<11>